/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

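/* Shared lookup tables (zeroed here, filled in at init time): indexed as
   cm = ff_cropTbl + MAX_NEG_CROP, ff_cropTbl clamps any value in
   [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to [0,255]; indexed as
   sq = ff_squareTbl + 256, ff_squareTbl yields d*d for d in [-255,255]. */
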
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
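/* ~0UL/255 evaluates to 0x01...01 with one byte set per byte of the native
   word, so multiplying by 0x7f or 0x80 replicates that byte across the whole
   word, e.g. pb_7f == 0x7f7f7f7f when unsigned long is 32 bits wide. */
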
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

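/* The table lets a division be replaced by a 32x32->64 multiply and a shift
   (cf. the FASTDIV() macro elsewhere in the FFmpeg tree), e.g.
       uint32_t q = ((uint64_t)a * ff_inverse[b]) >> 32;   // == a/b
   which holds for the ranges stated above since ff_inverse[b] == ceil(2^32/b). */
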
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

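/* Note on the sse*_c functions above: each difference pix1[i]-pix2[i] lies in
   [-255,255], which is why sq points at ff_squareTbl+256; indexing with a
   negative difference is valid and yields its square. */
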
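/* Wavelet-domain distortion metrics for the snow encoder: w_c() takes the
   difference of two blocks, runs a forward spatial DWT over it (type 0 =
   9/7 wavelet, type 1 = 5/3, cf. the scale[] comments) and returns a
   weighted sum of the absolute subband coefficients. */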
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

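/* PIXOP2() instantiates the whole family of put/avg pixel primitives: plain
   copies plus the _x2/_y2/_xy2 half-pel interpolating variants and their
   no_rnd (round-down) counterparts. The disabled #if 0 branch below is an
   alternative that processes 64-bit words at a time; the active branch
   processes 32-bit words. */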
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

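/* The OP bodies above rely on the classic byte-parallel averaging
   identities: per byte, (a+b+1)>>1 == (a|b) - (((a^b)&0xFE..FE)>>1) and
   (a+b)>>1 == (a&b) + (((a^b)&0xFE..FE)>>1), where the 0xFE mask drops the
   bit that would otherwise be shifted across a byte boundary; rnd_avg32()
   and no_rnd_avg32() are these identities on 32-bit words. avg2/avg4 below
   are the plain scalar equivalents. */
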
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

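/* gmc1_c(): 1/16-pel bilinear interpolation over an 8-pixel-wide block.
   With x16,y16 in [0,16), the four weights A+B+C+D sum to 16*16 = 256, so
   adding the caller-supplied rounder and shifting right by 8 renormalizes. */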
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

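/* ff_gmc_c(): general (affine) global motion compensation. The sampling
   position advances by (dxx,dyx) per output x and (dxy,dyy) per output y;
   positions carry 'shift' fractional bits, samples are bilinearly filtered,
   and coordinates outside the picture are clamped with av_clip(). */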
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

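/* The third-pel (tpel) filters below approximate division by 3 and by 12
   with a multiply and a shift: 683 == (2^11+1)/3, so (683*n)>>11 == n/3,
   and 2731 == ceil(2^15/12), so (2731*n)>>15 == n/12, both exact for the
   value ranges that occur here; the +1 / +6 terms provide rounding. */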
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
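
/* A, B, C, D are the standard bilinear chroma weights: with 0 <= x,y < 8,
 * A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, so OP() divides the
 * weighted sum by 64.  Example: x = 2, y = 3 gives A = 30, B = 10, C = 18,
 * D = 6 (sum 64).  When D == 0 the position lies on an integer row or
 * column and the else-branch collapses the 2-D filter to a cheaper 1-D one
 * along 'step'. */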

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
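
/* Note the bias of 32 - 4 = 28 rather than 32: the result is rounded
 * slightly downwards.  This "no rounding" variant presumably serves codecs
 * with a rounding-control bit (VC-1/WMV3) in their rnd == 0 mode. */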

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
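
/* The MPEG-4 qpel lowpass above is the 8-tap (-1, 3, -6, 20, 20, -6, 3, -1)
 * half-pel filter; the taps sum to 32, hence the "+16 >> 5" rounding in the
 * OP macros below.  Near the block edges the tap indices fold back onto the
 * outermost samples (e.g. src[8] appears twice in the dst[5..7] rows), so
 * the border is effectively mirrored instead of reading past the copied
 * block. */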

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

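/* H264_LOWPASS implements the H.264 6-tap (1, -5, 20, 20, -5, 1) half-pel
 * filter.  The taps sum to 32, so the single-pass OP() rounds with
 * (b + 16) >> 5.  The hv path first filters horizontally into the int16_t
 * tmp[] buffer (intermediate values exceed 8 bits and can be negative),
 * then filters that vertically; the combined gain is 32*32 = 1024, hence
 * OP2()'s (b + 512) >> 10. */
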
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

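/* H264_MC generates all 16 quarter-pel positions for one block size.  The
 * _mcXY suffix encodes the fractional offset in quarter pixels, X being the
 * horizontal and Y the vertical part: _mc00 is the full-pel copy, _mc20 the
 * horizontal half-pel, _mc22 the centre half-pel position, and the odd
 * positions average two neighbouring planes via the pixels##SIZE##_l2
 * helpers. */
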
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
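
/* H.264 explicit weighted prediction.  Unidirectional:
 *     block[x] = clip((block[x]*weight + offset') >> log2_denom)
 * where offset' is the user offset scaled by 2^log2_denom plus a rounding
 * term of 2^(log2_denom-1).  With weight = 64, log2_denom = 6, offset = 0
 * this reduces to the identity: (p*64 + 32) >> 6 == p for 0 <= p <= 255.
 * The biweight variant combines two references as
 *     dst[x] = clip((src[x]*weights + dst[x]*weightd + offset') >> (log2_denom+1)). */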

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
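
/* The WMV2 "mspel" half-pel filter is the 4-tap (-1, 9, 9, -1) kernel: the
 * taps sum to 16, so each output is rounded with (sum + 8) >> 4 and then
 * clipped to 0..255 through the cm[] crop table. */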

#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2600

    
2601
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

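/* WMV2 8x8 motion compensation: put_mspel8_mcXY_c fetches the block at
 * half-pel position (X/2, Y/2), combining the h/v lowpass passes above
 * and averaging via put_pixels8_l2 where needed. */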
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

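/* H.263 in-loop deblocking: d measures the step across the block edge;
 * the d1 ramp leaves small gradients alone, corrects medium-sized ones
 * and fades back to zero for large steps that are likely real edges.
 * The p&256 test clamps slightly out-of-range results back to 0..255. */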
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

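/* H.264 in-loop deblocking, normal (tc0-driven) mode: each 4-pel edge
 * segment is filtered only when |p0-q0| < alpha and the gradients on
 * both sides stay below beta; tc0[i] < 0 means "skip this segment". */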
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

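/* SAD (sum of absolute differences) motion-estimation comparison
 * functions; the _x2/_y2/_xy2 variants compare against the reference
 * interpolated at horizontal/vertical/diagonal half-pel positions. */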
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

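/* Noise-preserving SSE: plain SSE plus a penalty on the difference in
 * local detail (mixed second differences) between the two blocks,
 * weighted by avctx->nsse_weight (8 when no context is available). */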
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

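/* Helpers for quantizer noise shaping in the encoder: try_8x8basis_c
 * estimates the weighted error after adding a scaled basis function to
 * the residual, add_8x8basis_c actually applies it. */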
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

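/* Fills the 5-entry cmp[] table (one slot per block size) with the
 * comparison functions selected by the FF_CMP_* id in the low byte of
 * type. */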
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

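/* Byte-wise arithmetic on a whole machine word at a time (SWAR): the
 * pb_7f/pb_80 masks keep per-byte carries from spilling into the
 * neighbouring lanes. */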
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

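/* HuffYUV median prediction: each pixel is predicted as the median of
 * left, top and left+top-topleft, and the residual is stored. */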
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

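/* Butterfly helpers for the 8x8 Hadamard transform used by the SATD
 * (hadamard8_*) comparison functions below. */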
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

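/* One-dimensional pass of the H.264-style 8x8 integer transform, used
 * by dct264_sad8x8_c below (GPL-only code). */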
#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

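/* Rate-distortion comparison: quantize and reconstruct the difference
 * block, count the bits its coefficients would cost, and return
 * SSE + lambda*bits with lambda derived from qscale^2. */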
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

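/* WRAPPER8_16_SQ builds the 16x16 comparison functions by summing the
 * 8x8 kernel over the four 8x8 quadrants of the block. */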
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}

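/* Bit-level float -> int16 conversion: the float is reinterpreted as an
 * integer and saturated when out of range; this assumes the caller has
 * already scaled and biased the samples as this trick requires. */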
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((const int32_t*)src)[i];
        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed as soon as all IDCTs are
 converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

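/* ff_cropTbl is a clamping table: ff_cropTbl[i + MAX_NEG_CROP] equals
 * av_clip_uint8(i), so out-of-range intermediates clamp with one load. */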
/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

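    /* lowres decoding uses progressively smaller IDCTs (4x4, 2x2, 1x1)
     * so frames can be reconstructed directly at 1/2, 1/4 or 1/8 size. */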
    if(avctx->lowres==1){
4014
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4015
            c->idct_put= ff_jref_idct4_put;
4016
            c->idct_add= ff_jref_idct4_add;
4017
        }else{
4018
            c->idct_put= ff_h264_lowres_idct_put_c;
4019
            c->idct_add= ff_h264_lowres_idct_add_c;
4020
        }
4021
        c->idct    = j_rev_dct4;
4022
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4023
    }else if(avctx->lowres==2){
4024
        c->idct_put= ff_jref_idct2_put;
4025
        c->idct_add= ff_jref_idct2_add;
4026
        c->idct    = j_rev_dct2;
4027
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4028
    }else if(avctx->lowres==3){
4029
        c->idct_put= ff_jref_idct1_put;
4030
        c->idct_add= ff_jref_idct1_add;
4031
        c->idct    = j_rev_dct1;
4032
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4033
    }else{
4034
        if(avctx->idct_algo==FF_IDCT_INT){
4035
            c->idct_put= ff_jref_idct_put;
4036
            c->idct_add= ff_jref_idct_add;
4037
            c->idct    = j_rev_dct;
4038
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4039
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4040
                avctx->idct_algo==FF_IDCT_VP3){
4041
            c->idct_put= ff_vp3_idct_put_c;
4042
            c->idct_add= ff_vp3_idct_add_c;
4043
            c->idct    = ff_vp3_idct_c;
4044
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4045
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4046
            c->idct_put= ff_wmv2_idct_put_c;
4047
            c->idct_add= ff_wmv2_idct_add_c;
4048
            c->idct    = ff_wmv2_idct_c;
4049
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4050
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4051
            c->idct_put= ff_faanidct_put;
4052
            c->idct_add= ff_faanidct_add;
4053
            c->idct    = ff_faanidct;
4054
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4055
        }else{ //accurate/default
4056
            c->idct_put= ff_simple_idct_put;
4057
            c->idct_add= ff_simple_idct_add;
4058
            c->idct    = ff_simple_idct;
4059
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4060
        }
4061
    }
4062

    
4063
    if (ENABLE_H264_DECODER) {
4064
        c->h264_idct_add= ff_h264_idct_add_c;
4065
        c->h264_idct8_add= ff_h264_idct8_add_c;
4066
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4067
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4068
    }
4069

    
4070
    c->get_pixels = get_pixels_c;
4071
    c->diff_pixels = diff_pixels_c;
4072
    c->put_pixels_clamped = put_pixels_clamped_c;
4073
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4074
    c->add_pixels_clamped = add_pixels_clamped_c;
4075
    c->add_pixels8 = add_pixels8_c;
4076
    c->add_pixels4 = add_pixels4_c;
4077
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4078
    c->gmc1 = gmc1_c;
4079
    c->gmc = ff_gmc_c;
4080
    c->clear_blocks = clear_blocks_c;
4081
    c->pix_sum = pix_sum_c;
4082
    c->pix_norm1 = pix_norm1_c;
4083

    
4084
    /* TODO [0] 16  [1] 8 */
4085
    c->pix_abs[0][0] = pix_abs16_c;
4086
    c->pix_abs[0][1] = pix_abs16_x2_c;
4087
    c->pix_abs[0][2] = pix_abs16_y2_c;
4088
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4089
    c->pix_abs[1][0] = pix_abs8_c;
4090
    c->pix_abs[1][1] = pix_abs8_x2_c;
4091
    c->pix_abs[1][2] = pix_abs8_y2_c;
4092
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4093

    
4094
#define dspfunc(PFX, IDX, NUM) \
4095
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4096
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4097
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4098
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4099

    
4100
    dspfunc(put, 0, 16);
4101
    dspfunc(put_no_rnd, 0, 16);
4102
    dspfunc(put, 1, 8);
4103
    dspfunc(put_no_rnd, 1, 8);
4104
    dspfunc(put, 2, 4);
4105
    dspfunc(put, 3, 2);
4106

    
4107
    dspfunc(avg, 0, 16);
4108
    dspfunc(avg_no_rnd, 0, 16);
4109
    dspfunc(avg, 1, 8);
4110
    dspfunc(avg_no_rnd, 1, 8);
4111
    dspfunc(avg, 2, 4);
4112
    dspfunc(avg, 3, 2);
4113
#undef dspfunc
4114

    
4115
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4116
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4117

    
4118
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

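    /* fill one row of a quarter-pel table with all 16 mcXY positions,
       indexed as X + 4*Y where X/Y are the quarter-pel offsets (0..3) */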
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
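    /* H.264 chroma MC is plain bilinear interpolation at 1/8-pel
       resolution; [0]/[1]/[2] handle 8-, 4- and 2-pixel wide blocks.
       The no-rounding 8-wide variant is also used by the VC-1/WMV3 code. */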
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

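    /* H.264 weighted prediction: one entry per partition size, from
       16x16 down to 2x2; the biweight variants blend two references */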
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

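    /* let conditionally compiled codecs (CAVS, VC-1/WMV3, the WMV2/VC-1
       intra X8 coder, the H.264 encoder) install their own DSP functions */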
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

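    /* "mspel" motion compensation as used by WMV2 */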
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;