Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ f315b394

History | View | Annotate | Download (153 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "mpegvideo.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "h263.h"
36
#include "snow.h"
37

    
38
/* snow.c */
39
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40

    
41
/* vorbis.c */
42
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43

    
44
/* flacenc.c */
45
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
46

    
47
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
48
uint32_t ff_squareTbl[512] = {0, };
49

    
50
const uint8_t ff_zigzag_direct[64] = {
51
    0,   1,  8, 16,  9,  2,  3, 10,
52
    17, 24, 32, 25, 18, 11,  4,  5,
53
    12, 19, 26, 33, 40, 48, 41, 34,
54
    27, 20, 13,  6,  7, 14, 21, 28,
55
    35, 42, 49, 56, 57, 50, 43, 36,
56
    29, 22, 15, 23, 30, 37, 44, 51,
57
    58, 59, 52, 45, 38, 31, 39, 46,
58
    53, 60, 61, 54, 47, 55, 62, 63
59
};
60

    
61
/* Specific zigzag scan for 248 idct. NOTE that unlike the
62
   specification, we interleave the fields */
63
const uint8_t ff_zigzag248_direct[64] = {
64
     0,  8,  1,  9, 16, 24,  2, 10,
65
    17, 25, 32, 40, 48, 56, 33, 41,
66
    18, 26,  3, 11,  4, 12, 19, 27,
67
    34, 42, 49, 57, 50, 58, 35, 43,
68
    20, 28,  5, 13,  6, 14, 21, 29,
69
    36, 44, 51, 59, 52, 60, 37, 45,
70
    22, 30,  7, 15, 23, 31, 38, 46,
71
    53, 61, 54, 62, 39, 47, 55, 63,
72
};
73

    
74
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
75
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
76

    
77
const uint8_t ff_alternate_horizontal_scan[64] = {
78
    0,  1,   2,  3,  8,  9, 16, 17,
79
    10, 11,  4,  5,  6,  7, 15, 14,
80
    13, 12, 19, 18, 24, 25, 32, 33,
81
    26, 27, 20, 21, 22, 23, 28, 29,
82
    30, 31, 34, 35, 40, 41, 48, 49,
83
    42, 43, 36, 37, 38, 39, 44, 45,
84
    46, 47, 50, 51, 56, 57, 58, 59,
85
    52, 53, 54, 55, 60, 61, 62, 63,
86
};
87

    
88
const uint8_t ff_alternate_vertical_scan[64] = {
89
    0,  8,  16, 24,  1,  9,  2, 10,
90
    17, 25, 32, 40, 48, 56, 57, 49,
91
    41, 33, 26, 18,  3, 11,  4, 12,
92
    19, 27, 34, 42, 50, 58, 35, 43,
93
    51, 59, 20, 28,  5, 13,  6, 14,
94
    21, 29, 36, 44, 52, 60, 37, 45,
95
    53, 61, 22, 30,  7, 15, 23, 31,
96
    38, 46, 54, 62, 39, 47, 55, 63,
97
};
98

    
99
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
100
const uint32_t ff_inverse[256]={
101
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
102
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
103
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
104
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
105
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
106
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
107
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
108
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
109
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
110
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
111
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
112
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
113
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
114
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
115
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
116
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
117
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
118
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
119
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
120
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
121
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
122
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
123
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
124
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
125
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
126
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
127
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
128
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
129
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
130
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
131
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
132
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
133
};
134

    
135
/* Input permutation for the simple_idct_mmx */
136
static const uint8_t simple_mmx_permutation[64]={
137
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
138
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
139
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
140
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
141
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
142
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
143
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
144
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
145
};
146

    
147
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of all pixels (0..65280)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;      /* advance to the next row */
    }
    return total;
}
168

    
169
static int pix_norm1_c(uint8_t * pix, int line_size)
170
{
171
    int s, i, j;
172
    uint32_t *sq = ff_squareTbl + 256;
173

    
174
    s = 0;
175
    for (i = 0; i < 16; i++) {
176
        for (j = 0; j < 16; j += 8) {
177
#if 0
178
            s += sq[pix[0]];
179
            s += sq[pix[1]];
180
            s += sq[pix[2]];
181
            s += sq[pix[3]];
182
            s += sq[pix[4]];
183
            s += sq[pix[5]];
184
            s += sq[pix[6]];
185
            s += sq[pix[7]];
186
#else
187
#if LONG_MAX > 2147483647
188
            register uint64_t x=*(uint64_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            s += sq[(x>>32)&0xff];
194
            s += sq[(x>>40)&0xff];
195
            s += sq[(x>>48)&0xff];
196
            s += sq[(x>>56)&0xff];
197
#else
198
            register uint32_t x=*(uint32_t*)pix;
199
            s += sq[x&0xff];
200
            s += sq[(x>>8)&0xff];
201
            s += sq[(x>>16)&0xff];
202
            s += sq[(x>>24)&0xff];
203
            x=*(uint32_t*)(pix+4);
204
            s += sq[x&0xff];
205
            s += sq[(x>>8)&0xff];
206
            s += sq[(x>>16)&0xff];
207
            s += sq[(x>>24)&0xff];
208
#endif
209
#endif
210
            pix += 8;
211
        }
212
        pix += line_size - 16;
213
    }
214
    return s;
215
}
216

    
217
/**
 * Byte-swap each 32-bit word of src into dst.
 * dst and src may be the same buffer; w is the word count.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
234

    
235
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
236
{
237
    int s, i;
238
    uint32_t *sq = ff_squareTbl + 256;
239

    
240
    s = 0;
241
    for (i = 0; i < h; i++) {
242
        s += sq[pix1[0] - pix2[0]];
243
        s += sq[pix1[1] - pix2[1]];
244
        s += sq[pix1[2] - pix2[2]];
245
        s += sq[pix1[3] - pix2[3]];
246
        pix1 += line_size;
247
        pix2 += line_size;
248
    }
249
    return s;
250
}
251

    
252
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
253
{
254
    int s, i;
255
    uint32_t *sq = ff_squareTbl + 256;
256

    
257
    s = 0;
258
    for (i = 0; i < h; i++) {
259
        s += sq[pix1[0] - pix2[0]];
260
        s += sq[pix1[1] - pix2[1]];
261
        s += sq[pix1[2] - pix2[2]];
262
        s += sq[pix1[3] - pix2[3]];
263
        s += sq[pix1[4] - pix2[4]];
264
        s += sq[pix1[5] - pix2[5]];
265
        s += sq[pix1[6] - pix2[6]];
266
        s += sq[pix1[7] - pix2[7]];
267
        pix1 += line_size;
268
        pix2 += line_size;
269
    }
270
    return s;
271
}
272

    
273
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
274
{
275
    int s, i;
276
    uint32_t *sq = ff_squareTbl + 256;
277

    
278
    s = 0;
279
    for (i = 0; i < h; i++) {
280
        s += sq[pix1[ 0] - pix2[ 0]];
281
        s += sq[pix1[ 1] - pix2[ 1]];
282
        s += sq[pix1[ 2] - pix2[ 2]];
283
        s += sq[pix1[ 3] - pix2[ 3]];
284
        s += sq[pix1[ 4] - pix2[ 4]];
285
        s += sq[pix1[ 5] - pix2[ 5]];
286
        s += sq[pix1[ 6] - pix2[ 6]];
287
        s += sq[pix1[ 7] - pix2[ 7]];
288
        s += sq[pix1[ 8] - pix2[ 8]];
289
        s += sq[pix1[ 9] - pix2[ 9]];
290
        s += sq[pix1[10] - pix2[10]];
291
        s += sq[pix1[11] - pix2[11]];
292
        s += sq[pix1[12] - pix2[12]];
293
        s += sq[pix1[13] - pix2[13]];
294
        s += sq[pix1[14] - pix2[14]];
295
        s += sq[pix1[15] - pix2[15]];
296

    
297
        pix1 += line_size;
298
        pix2 += line_size;
299
    }
300
    return s;
301
}
302

    
303

    
304
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
305
/**
 * Wavelet-domain comparison metric used by the snow encoder.
 * Takes the pixel difference of two w x h blocks (w must equal h, and be
 * 8, 16 or 32), runs a spatial DWT over it, then accumulates the absolute
 * subband coefficients weighted per level/orientation.
 *
 * @param type 0 = 9/7 wavelet, 1 = 5/3 wavelet (selects the weight table
 *             and is passed through to ff_spatial_dwt)
 * @return weighted sum of absolute wavelet coefficients, >> 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int x, y, level, ori;
    int s;
    const int dec_count = w==8 ? 3 : 4;   /* decomposition depth by block size */
    int tmp[32*32];                       /* DWT workspace, stride 32 */
    /* subband weights, indexed [wavelet type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* load the difference block into the workspace, scaled by 16 */
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            tmp[32*y + x] = (pix1[x] - pix2[x]) << 4;
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s = 0;
    assert(w == h);
    /* walk every subband; level 0 includes the LL band (ori 0) */
    for (level = 0; level < dec_count; level++) {
        for (ori = level ? 1 : 0; ori < 4; ori++) {
            int size   = w >> (dec_count - level);
            int sx     = (ori & 1) ? size : 0;        /* horizontal offset */
            int stride = 32 << (dec_count - level);
            int sy     = (ori & 2) ? stride >> 1 : 0; /* vertical offset */

            for (y = 0; y < size; y++) {
                for (x = 0; x < size; x++) {
                    int coef = tmp[sx + sy + y*stride + x] * scale[type][dec_count-3][level][ori];
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s >= 0);
    return s >> 9;
}
373

    
374
/** 5/3 wavelet metric on an 8-wide block. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
377

    
378
/** 9/7 wavelet metric on an 8-wide block. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
381

    
382
/** 5/3 wavelet metric on a 16-wide block. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
385

    
386
/** 9/7 wavelet metric on a 16-wide block. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
389

    
390
/** 5/3 wavelet metric on a 32-wide block (non-static: referenced externally). */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
393

    
394
/** 9/7 wavelet metric on a 32-wide block (non-static: referenced externally). */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
397
#endif
398

    
399
/**
 * Copy an 8x8 block of pixels into a DCT coefficient block.
 * @param block     destination, 64 contiguous DCTELEMs (row stride 8)
 * @param pixels    source pixels
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = pixels[x];
        pixels += line_size;
        block  += 8;
    }
}
417

    
418
/**
 * Store the per-pixel difference of two 8x8 blocks (s1 - s2) into a DCT
 * coefficient block.
 * @param block  destination, 64 contiguous DCTELEMs (row stride 8)
 * @param stride byte stride between rows of both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
437

    
438

    
439
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
440
                                 int line_size)
441
{
442
    int i;
443
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
444

    
445
    /* read the pixels */
446
    for(i=0;i<8;i++) {
447
        pixels[0] = cm[block[0]];
448
        pixels[1] = cm[block[1]];
449
        pixels[2] = cm[block[2]];
450
        pixels[3] = cm[block[3]];
451
        pixels[4] = cm[block[4]];
452
        pixels[5] = cm[block[5]];
453
        pixels[6] = cm[block[6]];
454
        pixels[7] = cm[block[7]];
455

    
456
        pixels += line_size;
457
        block += 8;
458
    }
459
}
460

    
461
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
462
                                 int line_size)
463
{
464
    int i;
465
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
466

    
467
    /* read the pixels */
468
    for(i=0;i<4;i++) {
469
        pixels[0] = cm[block[0]];
470
        pixels[1] = cm[block[1]];
471
        pixels[2] = cm[block[2]];
472
        pixels[3] = cm[block[3]];
473

    
474
        pixels += line_size;
475
        block += 8;
476
    }
477
}
478

    
479
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
480
                                 int line_size)
481
{
482
    int i;
483
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
484

    
485
    /* read the pixels */
486
    for(i=0;i<2;i++) {
487
        pixels[0] = cm[block[0]];
488
        pixels[1] = cm[block[1]];
489

    
490
        pixels += line_size;
491
        block += 8;
492
    }
493
}
494

    
495
/**
 * Write an 8x8 DCT coefficient block to pixels, mapping the signed range
 * by adding 128 and saturating to 0..255 (i.e. -128..127 -> 0..255).
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = block[8*y + x] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[x] = (uint8_t)v;
        }
        pixels += line_size;
    }
}
515

    
516
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
517
                          int line_size)
518
{
519
    int i;
520
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
521

    
522
    /* read the pixels */
523
    for(i=0;i<8;i++) {
524
        pixels[0] = cm[pixels[0] + block[0]];
525
        pixels[1] = cm[pixels[1] + block[1]];
526
        pixels[2] = cm[pixels[2] + block[2]];
527
        pixels[3] = cm[pixels[3] + block[3]];
528
        pixels[4] = cm[pixels[4] + block[4]];
529
        pixels[5] = cm[pixels[5] + block[5]];
530
        pixels[6] = cm[pixels[6] + block[6]];
531
        pixels[7] = cm[pixels[7] + block[7]];
532
        pixels += line_size;
533
        block += 8;
534
    }
535
}
536

    
537
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
538
                          int line_size)
539
{
540
    int i;
541
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
542

    
543
    /* read the pixels */
544
    for(i=0;i<4;i++) {
545
        pixels[0] = cm[pixels[0] + block[0]];
546
        pixels[1] = cm[pixels[1] + block[1]];
547
        pixels[2] = cm[pixels[2] + block[2]];
548
        pixels[3] = cm[pixels[3] + block[3]];
549
        pixels += line_size;
550
        block += 8;
551
    }
552
}
553

    
554
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
555
                          int line_size)
556
{
557
    int i;
558
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
559

    
560
    /* read the pixels */
561
    for(i=0;i<2;i++) {
562
        pixels[0] = cm[pixels[0] + block[0]];
563
        pixels[1] = cm[pixels[1] + block[1]];
564
        pixels += line_size;
565
        block += 8;
566
    }
567
}
568

    
569
/**
 * Add an 8x8 coefficient block to pixels WITHOUT clamping
 * (truncation to uint8_t happens on store).
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            pixels[x] += block[x];
        pixels += line_size;
        block  += 8;
    }
}
585

    
586
/**
 * Add a 4x4 coefficient block to pixels WITHOUT clamping.
 * Note: unlike the 8-wide variants, the coefficient rows here are
 * packed contiguously (stride 4, not 8).
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int y, x;

    for (y = 0; y < 4; y++) {
        for (x = 0; x < 4; x++)
            pixels[x] += block[x];
        pixels += line_size;
        block  += 4;   /* packed 4x4 coefficient layout */
    }
}
598

    
599
/**
 * Sum of absolute values over all 64 coefficients of a DCT block.
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    const DCTELEM *p   = block;
    const DCTELEM *end = block + 64;

    while (p < end) {
        /* FFABS is a macro: keep its argument side-effect free */
        total += FFABS(*p);
        p++;
    }
    return total;
}
606

    
607
#if 0
608

609
#define PIXOP2(OPNAME, OP) \
610
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
611
{\
612
    int i;\
613
    for(i=0; i<h; i++){\
614
        OP(*((uint64_t*)block), AV_RN64(pixels));\
615
        pixels+=line_size;\
616
        block +=line_size;\
617
    }\
618
}\
619
\
620
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
621
{\
622
    int i;\
623
    for(i=0; i<h; i++){\
624
        const uint64_t a= AV_RN64(pixels  );\
625
        const uint64_t b= AV_RN64(pixels+1);\
626
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627
        pixels+=line_size;\
628
        block +=line_size;\
629
    }\
630
}\
631
\
632
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
633
{\
634
    int i;\
635
    for(i=0; i<h; i++){\
636
        const uint64_t a= AV_RN64(pixels  );\
637
        const uint64_t b= AV_RN64(pixels+1);\
638
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639
        pixels+=line_size;\
640
        block +=line_size;\
641
    }\
642
}\
643
\
644
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
645
{\
646
    int i;\
647
    for(i=0; i<h; i++){\
648
        const uint64_t a= AV_RN64(pixels          );\
649
        const uint64_t b= AV_RN64(pixels+line_size);\
650
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651
        pixels+=line_size;\
652
        block +=line_size;\
653
    }\
654
}\
655
\
656
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657
{\
658
    int i;\
659
    for(i=0; i<h; i++){\
660
        const uint64_t a= AV_RN64(pixels          );\
661
        const uint64_t b= AV_RN64(pixels+line_size);\
662
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
663
        pixels+=line_size;\
664
        block +=line_size;\
665
    }\
666
}\
667
\
668
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
669
{\
670
        int i;\
671
        const uint64_t a= AV_RN64(pixels  );\
672
        const uint64_t b= AV_RN64(pixels+1);\
673
        uint64_t l0=  (a&0x0303030303030303ULL)\
674
                    + (b&0x0303030303030303ULL)\
675
                    + 0x0202020202020202ULL;\
676
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
677
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
678
        uint64_t l1,h1;\
679
\
680
        pixels+=line_size;\
681
        for(i=0; i<h; i+=2){\
682
            uint64_t a= AV_RN64(pixels  );\
683
            uint64_t b= AV_RN64(pixels+1);\
684
            l1=  (a&0x0303030303030303ULL)\
685
               + (b&0x0303030303030303ULL);\
686
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
687
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
688
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
689
            pixels+=line_size;\
690
            block +=line_size;\
691
            a= AV_RN64(pixels  );\
692
            b= AV_RN64(pixels+1);\
693
            l0=  (a&0x0303030303030303ULL)\
694
               + (b&0x0303030303030303ULL)\
695
               + 0x0202020202020202ULL;\
696
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
697
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
698
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
699
            pixels+=line_size;\
700
            block +=line_size;\
701
        }\
702
}\
703
\
704
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
705
{\
706
        int i;\
707
        const uint64_t a= AV_RN64(pixels  );\
708
        const uint64_t b= AV_RN64(pixels+1);\
709
        uint64_t l0=  (a&0x0303030303030303ULL)\
710
                    + (b&0x0303030303030303ULL)\
711
                    + 0x0101010101010101ULL;\
712
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
713
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
714
        uint64_t l1,h1;\
715
\
716
        pixels+=line_size;\
717
        for(i=0; i<h; i+=2){\
718
            uint64_t a= AV_RN64(pixels  );\
719
            uint64_t b= AV_RN64(pixels+1);\
720
            l1=  (a&0x0303030303030303ULL)\
721
               + (b&0x0303030303030303ULL);\
722
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
723
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
724
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
725
            pixels+=line_size;\
726
            block +=line_size;\
727
            a= AV_RN64(pixels  );\
728
            b= AV_RN64(pixels+1);\
729
            l0=  (a&0x0303030303030303ULL)\
730
               + (b&0x0303030303030303ULL)\
731
               + 0x0101010101010101ULL;\
732
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
733
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
734
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
735
            pixels+=line_size;\
736
            block +=line_size;\
737
        }\
738
}\
739
\
740
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
741
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
742
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
743
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
744
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
745
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
746
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
747

748
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
749
#else // 64 bit variant
750

    
751
#define PIXOP2(OPNAME, OP) \
752
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
753
    int i;\
754
    for(i=0; i<h; i++){\
755
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
756
        pixels+=line_size;\
757
        block +=line_size;\
758
    }\
759
}\
760
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761
    int i;\
762
    for(i=0; i<h; i++){\
763
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
764
        pixels+=line_size;\
765
        block +=line_size;\
766
    }\
767
}\
768
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
769
    int i;\
770
    for(i=0; i<h; i++){\
771
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
772
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
773
        pixels+=line_size;\
774
        block +=line_size;\
775
    }\
776
}\
777
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
778
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
779
}\
780
\
781
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782
                                                int src_stride1, int src_stride2, int h){\
783
    int i;\
784
    for(i=0; i<h; i++){\
785
        uint32_t a,b;\
786
        a= AV_RN32(&src1[i*src_stride1  ]);\
787
        b= AV_RN32(&src2[i*src_stride2  ]);\
788
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
789
        a= AV_RN32(&src1[i*src_stride1+4]);\
790
        b= AV_RN32(&src2[i*src_stride2+4]);\
791
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
792
    }\
793
}\
794
\
795
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796
                                                int src_stride1, int src_stride2, int h){\
797
    int i;\
798
    for(i=0; i<h; i++){\
799
        uint32_t a,b;\
800
        a= AV_RN32(&src1[i*src_stride1  ]);\
801
        b= AV_RN32(&src2[i*src_stride2  ]);\
802
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
803
        a= AV_RN32(&src1[i*src_stride1+4]);\
804
        b= AV_RN32(&src2[i*src_stride2+4]);\
805
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
806
    }\
807
}\
808
\
809
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
810
                                                int src_stride1, int src_stride2, int h){\
811
    int i;\
812
    for(i=0; i<h; i++){\
813
        uint32_t a,b;\
814
        a= AV_RN32(&src1[i*src_stride1  ]);\
815
        b= AV_RN32(&src2[i*src_stride2  ]);\
816
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
817
    }\
818
}\
819
\
820
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821
                                                int src_stride1, int src_stride2, int h){\
822
    int i;\
823
    for(i=0; i<h; i++){\
824
        uint32_t a,b;\
825
        a= AV_RN16(&src1[i*src_stride1  ]);\
826
        b= AV_RN16(&src2[i*src_stride2  ]);\
827
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
828
    }\
829
}\
830
\
831
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
832
                                                int src_stride1, int src_stride2, int h){\
833
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
834
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
835
}\
836
\
837
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
838
                                                int src_stride1, int src_stride2, int h){\
839
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
840
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
841
}\
842
\
843
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
845
}\
846
\
847
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
849
}\
850
\
851
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
853
}\
854
\
855
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
857
}\
858
\
859
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
860
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
861
    int i;\
862
    for(i=0; i<h; i++){\
863
        uint32_t a, b, c, d, l0, l1, h0, h1;\
864
        a= AV_RN32(&src1[i*src_stride1]);\
865
        b= AV_RN32(&src2[i*src_stride2]);\
866
        c= AV_RN32(&src3[i*src_stride3]);\
867
        d= AV_RN32(&src4[i*src_stride4]);\
868
        l0=  (a&0x03030303UL)\
869
           + (b&0x03030303UL)\
870
           + 0x02020202UL;\
871
        h0= ((a&0xFCFCFCFCUL)>>2)\
872
          + ((b&0xFCFCFCFCUL)>>2);\
873
        l1=  (c&0x03030303UL)\
874
           + (d&0x03030303UL);\
875
        h1= ((c&0xFCFCFCFCUL)>>2)\
876
          + ((d&0xFCFCFCFCUL)>>2);\
877
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878
        a= AV_RN32(&src1[i*src_stride1+4]);\
879
        b= AV_RN32(&src2[i*src_stride2+4]);\
880
        c= AV_RN32(&src3[i*src_stride3+4]);\
881
        d= AV_RN32(&src4[i*src_stride4+4]);\
882
        l0=  (a&0x03030303UL)\
883
           + (b&0x03030303UL)\
884
           + 0x02020202UL;\
885
        h0= ((a&0xFCFCFCFCUL)>>2)\
886
          + ((b&0xFCFCFCFCUL)>>2);\
887
        l1=  (c&0x03030303UL)\
888
           + (d&0x03030303UL);\
889
        h1= ((c&0xFCFCFCFCUL)>>2)\
890
          + ((d&0xFCFCFCFCUL)>>2);\
891
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
892
    }\
893
}\
894
\
895
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
897
}\
898
\
899
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
901
}\
902
\
903
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
905
}\
906
\
907
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
909
}\
910
\
911
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
912
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
913
    int i;\
914
    for(i=0; i<h; i++){\
915
        uint32_t a, b, c, d, l0, l1, h0, h1;\
916
        a= AV_RN32(&src1[i*src_stride1]);\
917
        b= AV_RN32(&src2[i*src_stride2]);\
918
        c= AV_RN32(&src3[i*src_stride3]);\
919
        d= AV_RN32(&src4[i*src_stride4]);\
920
        l0=  (a&0x03030303UL)\
921
           + (b&0x03030303UL)\
922
           + 0x01010101UL;\
923
        h0= ((a&0xFCFCFCFCUL)>>2)\
924
          + ((b&0xFCFCFCFCUL)>>2);\
925
        l1=  (c&0x03030303UL)\
926
           + (d&0x03030303UL);\
927
        h1= ((c&0xFCFCFCFCUL)>>2)\
928
          + ((d&0xFCFCFCFCUL)>>2);\
929
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930
        a= AV_RN32(&src1[i*src_stride1+4]);\
931
        b= AV_RN32(&src2[i*src_stride2+4]);\
932
        c= AV_RN32(&src3[i*src_stride3+4]);\
933
        d= AV_RN32(&src4[i*src_stride4+4]);\
934
        l0=  (a&0x03030303UL)\
935
           + (b&0x03030303UL)\
936
           + 0x01010101UL;\
937
        h0= ((a&0xFCFCFCFCUL)>>2)\
938
          + ((b&0xFCFCFCFCUL)>>2);\
939
        l1=  (c&0x03030303UL)\
940
           + (d&0x03030303UL);\
941
        h1= ((c&0xFCFCFCFCUL)>>2)\
942
          + ((d&0xFCFCFCFCUL)>>2);\
943
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
944
    }\
945
}\
946
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
947
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
948
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
949
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
950
}\
951
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
952
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
953
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
955
}\
956
\
957
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
958
{\
959
        int i, a0, b0, a1, b1;\
960
        a0= pixels[0];\
961
        b0= pixels[1] + 2;\
962
        a0 += b0;\
963
        b0 += pixels[2];\
964
\
965
        pixels+=line_size;\
966
        for(i=0; i<h; i+=2){\
967
            a1= pixels[0];\
968
            b1= pixels[1];\
969
            a1 += b1;\
970
            b1 += pixels[2];\
971
\
972
            block[0]= (a1+a0)>>2; /* FIXME non put */\
973
            block[1]= (b1+b0)>>2;\
974
\
975
            pixels+=line_size;\
976
            block +=line_size;\
977
\
978
            a0= pixels[0];\
979
            b0= pixels[1] + 2;\
980
            a0 += b0;\
981
            b0 += pixels[2];\
982
\
983
            block[0]= (a1+a0)>>2;\
984
            block[1]= (b1+b0)>>2;\
985
            pixels+=line_size;\
986
            block +=line_size;\
987
        }\
988
}\
989
\
990
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
991
{\
992
        int i;\
993
        const uint32_t a= AV_RN32(pixels  );\
994
        const uint32_t b= AV_RN32(pixels+1);\
995
        uint32_t l0=  (a&0x03030303UL)\
996
                    + (b&0x03030303UL)\
997
                    + 0x02020202UL;\
998
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
999
                   + ((b&0xFCFCFCFCUL)>>2);\
1000
        uint32_t l1,h1;\
1001
\
1002
        pixels+=line_size;\
1003
        for(i=0; i<h; i+=2){\
1004
            uint32_t a= AV_RN32(pixels  );\
1005
            uint32_t b= AV_RN32(pixels+1);\
1006
            l1=  (a&0x03030303UL)\
1007
               + (b&0x03030303UL);\
1008
            h1= ((a&0xFCFCFCFCUL)>>2)\
1009
              + ((b&0xFCFCFCFCUL)>>2);\
1010
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011
            pixels+=line_size;\
1012
            block +=line_size;\
1013
            a= AV_RN32(pixels  );\
1014
            b= AV_RN32(pixels+1);\
1015
            l0=  (a&0x03030303UL)\
1016
               + (b&0x03030303UL)\
1017
               + 0x02020202UL;\
1018
            h0= ((a&0xFCFCFCFCUL)>>2)\
1019
              + ((b&0xFCFCFCFCUL)>>2);\
1020
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021
            pixels+=line_size;\
1022
            block +=line_size;\
1023
        }\
1024
}\
1025
\
1026
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1027
{\
1028
    int j;\
1029
    for(j=0; j<2; j++){\
1030
        int i;\
1031
        const uint32_t a= AV_RN32(pixels  );\
1032
        const uint32_t b= AV_RN32(pixels+1);\
1033
        uint32_t l0=  (a&0x03030303UL)\
1034
                    + (b&0x03030303UL)\
1035
                    + 0x02020202UL;\
1036
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037
                   + ((b&0xFCFCFCFCUL)>>2);\
1038
        uint32_t l1,h1;\
1039
\
1040
        pixels+=line_size;\
1041
        for(i=0; i<h; i+=2){\
1042
            uint32_t a= AV_RN32(pixels  );\
1043
            uint32_t b= AV_RN32(pixels+1);\
1044
            l1=  (a&0x03030303UL)\
1045
               + (b&0x03030303UL);\
1046
            h1= ((a&0xFCFCFCFCUL)>>2)\
1047
              + ((b&0xFCFCFCFCUL)>>2);\
1048
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049
            pixels+=line_size;\
1050
            block +=line_size;\
1051
            a= AV_RN32(pixels  );\
1052
            b= AV_RN32(pixels+1);\
1053
            l0=  (a&0x03030303UL)\
1054
               + (b&0x03030303UL)\
1055
               + 0x02020202UL;\
1056
            h0= ((a&0xFCFCFCFCUL)>>2)\
1057
              + ((b&0xFCFCFCFCUL)>>2);\
1058
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1059
            pixels+=line_size;\
1060
            block +=line_size;\
1061
        }\
1062
        pixels+=4-line_size*(h+1);\
1063
        block +=4-line_size*h;\
1064
    }\
1065
}\
1066
\
1067
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1068
{\
1069
    int j;\
1070
    for(j=0; j<2; j++){\
1071
        int i;\
1072
        const uint32_t a= AV_RN32(pixels  );\
1073
        const uint32_t b= AV_RN32(pixels+1);\
1074
        uint32_t l0=  (a&0x03030303UL)\
1075
                    + (b&0x03030303UL)\
1076
                    + 0x01010101UL;\
1077
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078
                   + ((b&0xFCFCFCFCUL)>>2);\
1079
        uint32_t l1,h1;\
1080
\
1081
        pixels+=line_size;\
1082
        for(i=0; i<h; i+=2){\
1083
            uint32_t a= AV_RN32(pixels  );\
1084
            uint32_t b= AV_RN32(pixels+1);\
1085
            l1=  (a&0x03030303UL)\
1086
               + (b&0x03030303UL);\
1087
            h1= ((a&0xFCFCFCFCUL)>>2)\
1088
              + ((b&0xFCFCFCFCUL)>>2);\
1089
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090
            pixels+=line_size;\
1091
            block +=line_size;\
1092
            a= AV_RN32(pixels  );\
1093
            b= AV_RN32(pixels+1);\
1094
            l0=  (a&0x03030303UL)\
1095
               + (b&0x03030303UL)\
1096
               + 0x01010101UL;\
1097
            h0= ((a&0xFCFCFCFCUL)>>2)\
1098
              + ((b&0xFCFCFCFCUL)>>2);\
1099
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100
            pixels+=line_size;\
1101
            block +=line_size;\
1102
        }\
1103
        pixels+=4-line_size*(h+1);\
1104
        block +=4-line_size*h;\
1105
    }\
1106
}\
1107
\
1108
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1109
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1113
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1116

    
1117
/* Binary OPs plugged into the PIXOP2() template above:
 * op_avg: per-byte rounded average of old and new pixels (via rnd_avg32),
 * op_put: plain store, discarding the old pixel values. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* NOTE(review): matching #if/#else is above this chunk — confirm */
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 sample values, used by the
 * qpel/interpolation code further down. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1128

    
1129
/* Adapter: no-rounding 16-wide two-source average with all three strides
 * equal, matching the (dst, a, b, stride, h) callback signature.
 * The worker is generated by the PIXOP2() template above. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1132

    
1133
/* Adapter: no-rounding 8-wide two-source average with all three strides
 * equal; see put_no_rnd_pixels16_l2_c above. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1136

    
1137
/**
 * 1/16-pel bilinear motion compensation on an 8-pixel-wide block
 * (reference C implementation, one GMC point).
 *
 * @param dst     destination, written 8 bytes per row for h rows
 * @param src     source; rows [0..h] and columns [0..8] are read
 * @param stride  row stride shared by src and dst
 * @param h       number of rows
 * @param x16     horizontal sub-pel position, 0..16
 * @param y16     vertical sub-pel position, 0..16
 * @param rounder added before the >>8 normalization (weights sum to 256)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights of the four neighbouring samples; A+B+C+D == 256 */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]
                      + B * src[col + 1]
                      + C * src[stride + col]
                      + D * src[stride + col + 1]
                      + rounder) >> 8;
        }
    }
}
1159

    
1160
/**
 * Global motion compensation for one 8-pixel-wide stripe.
 * For each destination pixel a source position is derived from the
 * (vx, vy) accumulators in 16.16 fixed point; the sample is then
 * bilinearly interpolated with sub-pel precision 1<<shift, clipping
 * coordinates that fall outside [0, width/height) to the edge.
 *
 * @param dst     destination, 8 pixels per row, h rows
 * @param src     source picture (width x height before the -- below)
 * @param ox, oy  start position in 16.16 fixed point
 * @param dxx,dxy,dyx,dyy  per-pixel / per-row position increments
 * @param shift   sub-pel precision bits
 * @param r       rounding constant added before >>(shift*2)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* sub-pel scale; weights per axis sum to s */

    /* convert to the largest valid coordinate so the (unsigned) compares
       below treat both negative and >= max positions as out of range */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);  /* sub-pel fraction, 0..s-1 */
            frac_y= src_y&(s-1);
            src_x>>=shift;        /* integer sample coordinate */
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both outside: nearest edge sample, no interpolation */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1217

    
1218
/* Thirdpel MC with zero sub-pel offset: a straight copy, dispatched on
 * block width to the PIXOP2()-generated copy helpers. Unsupported widths
 * fall through silently (callers pass 2/4/8/16 only). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1226

    
1227
/* Thirdpel MC, horizontal offset 1/3: weights current sample 2x and its
 * right neighbour 1x; 683/2048 approximates the 1/3 normalization. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
    }
}
1237

    
1238
/* Thirdpel MC, horizontal offset 2/3: weights the right neighbour 2x. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
    }
}
1248

    
1249
/* Thirdpel MC, vertical offset 1/3: weights current sample 2x and the
 * sample one row below 1x. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
    }
}
1259

    
1260
/* Thirdpel MC, offset (1/3, 1/3): 2-D weighted average of the 2x2
 * neighbourhood; 2731/32768 approximates the 1/12 normalization. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
    }
}
1270

    
1271
/* Thirdpel MC, offset (1/3, 2/3): 2-D weighted average, vertical weight
 * biased toward the row below. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1281

    
1282
/* Thirdpel MC, vertical offset 2/3: weights the sample one row below 2x. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
    }
}
1292

    
1293
/* Thirdpel MC, offset (2/3, 1/3): 2-D weighted average, horizontal weight
 * biased toward the right neighbour. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1303

    
1304
/* Thirdpel MC, offset (2/3, 2/3): 2-D weighted average biased toward the
 * bottom-right neighbour. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
    }
}
1314

    
1315
/* Thirdpel MC with zero sub-pel offset, averaging variant: rounded average
 * of dst and src, dispatched on block width to the PIXOP2()-generated
 * averaging helpers. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1323

    
1324
/* Averaging variant of put_tpel_pixels_mc10_c: rounded average of the
 * existing dst pixel and the (1/3, 0) interpolated sample. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
    }
}
1334

    
1335
/* Averaging variant of put_tpel_pixels_mc20_c: rounded average of the
 * existing dst pixel and the (2/3, 0) interpolated sample. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
    }
}
1345

    
1346
/* Averaging variant of put_tpel_pixels_mc01_c: rounded average of the
 * existing dst pixel and the (0, 1/3) interpolated sample. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1356

    
1357
/* Averaging variant of put_tpel_pixels_mc11_c: rounded average of the
 * existing dst pixel and the (1/3, 1/3) interpolated sample. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1367

    
1368
/* Averaging variant of put_tpel_pixels_mc12_c: rounded average of the
 * existing dst pixel and the (1/3, 2/3) interpolated sample. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1378

    
1379
/* Averaging variant of put_tpel_pixels_mc02_c: rounded average of the
 * existing dst pixel and the (0, 2/3) interpolated sample. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1389

    
1390
/* Averaging variant of put_tpel_pixels_mc21_c: rounded average of the
 * existing dst pixel and the (2/3, 1/3) interpolated sample. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1400

    
1401
/* Averaging variant of put_tpel_pixels_mc22_c: rounded average of the
 * existing dst pixel and the (2/3, 2/3) interpolated sample. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1411
#if 0
1412
#define TPEL_WIDTH(width)\
1413
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1415
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1417
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1419
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1420
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1421
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1422
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1423
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1424
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1425
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1426
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1427
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1428
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1429
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1430
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1431
#endif
1432

    
1433
/* Template generating H.264 chroma bilinear MC for widths 2, 4 and 8.
 * Weights A..D come from the eighth-pel offsets (x, y) in 0..7 and sum
 * to 64; OP performs the final normalization and store (see op_put /
 * op_avg below). When D == 0 one of the axes has no fractional part and
 * the loop degenerates to a cheaper 1-D interpolation along either the
 * horizontal (step 1) or vertical (step stride) axis. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1533

    
1534
/* OPs for the H264_CHROMA_MC template: b is the raw weighted sum (weights
 * total 64), so (b+32)>>6 is the rounded normalization. op_avg then takes
 * the rounded average with the existing dst pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1541

    
1542
/* H.264-style 8-wide bilinear chroma MC, "no rounding" flavour: same
 * weighted sum as the template-generated mc8, but normalized with the
 * smaller bias 32-4 = 28 before >>6. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    /* eighth-pel bilinear weights; A+B+C+D == 64 */
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]
                      + B * src[col + 1]
                      + C * src[stride + col]
                      + D * src[stride + col + 1]
                      + 32 - 4) >> 6;
        }
    }
}
1565

    
1566
/**
 * Generate the complete MPEG-4 quarter-pel motion compensation function
 * family for one output operation.
 * @param r      rounding selector, unused in this C version (kept for
 *               interface symmetry with the instantiation list)
 * @param OPNAME prefix of the generated function names (put_, avg_, ...)
 * @param RND    infix selecting which rounding flavour of the half-pel
 *               helpers the mcXY functions call (_ or _no_rnd_)
 * @param OP     store macro: normalizes the filtered sum (via the cm clip
 *               table) and writes or averages it into the destination
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal half-pel filter, taps (20,-6,3,-1); taps that would \
 * read past the 9 available samples reuse mirrored in-block samples */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;  /* clip table, used by OP */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* same filter applied vertically over 8 columns (reads 9 source rows) */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src9));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* 16-wide horizontal half-pel filter (reads 17 samples per row) */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 16-wide vertical half-pel filter (reads 17 source rows) */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* qpelN_mcXY_c: X,Y are the quarter-pel offsets (in quarters of a pixel). \
 * Half-pel intermediates are produced with the put##RND helpers into \
 * stack buffers (half*, full) and merged by pixels*_l2/_l4 averaging. \
 * The non-static ff_*_old_c variants are exported alternative \
 * implementations using separate H and V half-pel planes. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* 16x16 versions: identical structure, 17-sample/17-row intermediates */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
/* Store macros for QPEL_MC: the filtered sum b is normalized and clipped
 * to 0..255 through the cm table.  The regular variants add 16 before
 * >>5 (round to nearest); the _no_rnd variants add 15 (round down).
 * op_avg additionally averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the put, put_no_rnd and avg qpel function families. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
#if 1
2064
#define H264_LOWPASS(OPNAME, OP, OP2) \
2065
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2066
    const int h=2;\
2067
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2068
    int i;\
2069
    for(i=0; i<h; i++)\
2070
    {\
2071
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2072
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2073
        dst+=dstStride;\
2074
        src+=srcStride;\
2075
    }\
2076
}\
2077
\
2078
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2079
    const int w=2;\
2080
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2081
    int i;\
2082
    for(i=0; i<w; i++)\
2083
    {\
2084
        const int srcB= src[-2*srcStride];\
2085
        const int srcA= src[-1*srcStride];\
2086
        const int src0= src[0 *srcStride];\
2087
        const int src1= src[1 *srcStride];\
2088
        const int src2= src[2 *srcStride];\
2089
        const int src3= src[3 *srcStride];\
2090
        const int src4= src[4 *srcStride];\
2091
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2092
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2093
        dst++;\
2094
        src++;\
2095
    }\
2096
}\
2097
\
2098
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2099
    const int h=2;\
2100
    const int w=2;\
2101
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2102
    int i;\
2103
    src -= 2*srcStride;\
2104
    for(i=0; i<h+5; i++)\
2105
    {\
2106
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2107
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2108
        tmp+=tmpStride;\
2109
        src+=srcStride;\
2110
    }\
2111
    tmp -= tmpStride*(h+5-2);\
2112
    for(i=0; i<w; i++)\
2113
    {\
2114
        const int tmpB= tmp[-2*tmpStride];\
2115
        const int tmpA= tmp[-1*tmpStride];\
2116
        const int tmp0= tmp[0 *tmpStride];\
2117
        const int tmp1= tmp[1 *tmpStride];\
2118
        const int tmp2= tmp[2 *tmpStride];\
2119
        const int tmp3= tmp[3 *tmpStride];\
2120
        const int tmp4= tmp[4 *tmpStride];\
2121
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2122
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2123
        dst++;\
2124
        tmp++;\
2125
    }\
2126
}\
2127
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2128
    const int h=4;\
2129
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2130
    int i;\
2131
    for(i=0; i<h; i++)\
2132
    {\
2133
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2134
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2135
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2136
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2137
        dst+=dstStride;\
2138
        src+=srcStride;\
2139
    }\
2140
}\
2141
\
2142
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2143
    const int w=4;\
2144
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2145
    int i;\
2146
    for(i=0; i<w; i++)\
2147
    {\
2148
        const int srcB= src[-2*srcStride];\
2149
        const int srcA= src[-1*srcStride];\
2150
        const int src0= src[0 *srcStride];\
2151
        const int src1= src[1 *srcStride];\
2152
        const int src2= src[2 *srcStride];\
2153
        const int src3= src[3 *srcStride];\
2154
        const int src4= src[4 *srcStride];\
2155
        const int src5= src[5 *srcStride];\
2156
        const int src6= src[6 *srcStride];\
2157
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2158
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2159
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2160
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2161
        dst++;\
2162
        src++;\
2163
    }\
2164
}\
2165
\
2166
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2167
    const int h=4;\
2168
    const int w=4;\
2169
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2170
    int i;\
2171
    src -= 2*srcStride;\
2172
    for(i=0; i<h+5; i++)\
2173
    {\
2174
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2175
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2176
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2177
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2178
        tmp+=tmpStride;\
2179
        src+=srcStride;\
2180
    }\
2181
    tmp -= tmpStride*(h+5-2);\
2182
    for(i=0; i<w; i++)\
2183
    {\
2184
        const int tmpB= tmp[-2*tmpStride];\
2185
        const int tmpA= tmp[-1*tmpStride];\
2186
        const int tmp0= tmp[0 *tmpStride];\
2187
        const int tmp1= tmp[1 *tmpStride];\
2188
        const int tmp2= tmp[2 *tmpStride];\
2189
        const int tmp3= tmp[3 *tmpStride];\
2190
        const int tmp4= tmp[4 *tmpStride];\
2191
        const int tmp5= tmp[5 *tmpStride];\
2192
        const int tmp6= tmp[6 *tmpStride];\
2193
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2194
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2195
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2196
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2197
        dst++;\
2198
        tmp++;\
2199
    }\
2200
}\
2201
\
2202
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2203
    const int h=8;\
2204
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2205
    int i;\
2206
    for(i=0; i<h; i++)\
2207
    {\
2208
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2209
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2210
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2211
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2212
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2213
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2214
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2215
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2216
        dst+=dstStride;\
2217
        src+=srcStride;\
2218
    }\
2219
}\
2220
\
2221
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2222
    const int w=8;\
2223
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2224
    int i;\
2225
    for(i=0; i<w; i++)\
2226
    {\
2227
        const int srcB= src[-2*srcStride];\
2228
        const int srcA= src[-1*srcStride];\
2229
        const int src0= src[0 *srcStride];\
2230
        const int src1= src[1 *srcStride];\
2231
        const int src2= src[2 *srcStride];\
2232
        const int src3= src[3 *srcStride];\
2233
        const int src4= src[4 *srcStride];\
2234
        const int src5= src[5 *srcStride];\
2235
        const int src6= src[6 *srcStride];\
2236
        const int src7= src[7 *srcStride];\
2237
        const int src8= src[8 *srcStride];\
2238
        const int src9= src[9 *srcStride];\
2239
        const int src10=src[10*srcStride];\
2240
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2241
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2242
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2243
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2244
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2245
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2246
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2247
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2248
        dst++;\
2249
        src++;\
2250
    }\
2251
}\
2252
\
2253
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2254
    const int h=8;\
2255
    const int w=8;\
2256
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2257
    int i;\
2258
    src -= 2*srcStride;\
2259
    for(i=0; i<h+5; i++)\
2260
    {\
2261
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2262
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2263
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2264
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2265
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2266
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2267
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2268
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2269
        tmp+=tmpStride;\
2270
        src+=srcStride;\
2271
    }\
2272
    tmp -= tmpStride*(h+5-2);\
2273
    for(i=0; i<w; i++)\
2274
    {\
2275
        const int tmpB= tmp[-2*tmpStride];\
2276
        const int tmpA= tmp[-1*tmpStride];\
2277
        const int tmp0= tmp[0 *tmpStride];\
2278
        const int tmp1= tmp[1 *tmpStride];\
2279
        const int tmp2= tmp[2 *tmpStride];\
2280
        const int tmp3= tmp[3 *tmpStride];\
2281
        const int tmp4= tmp[4 *tmpStride];\
2282
        const int tmp5= tmp[5 *tmpStride];\
2283
        const int tmp6= tmp[6 *tmpStride];\
2284
        const int tmp7= tmp[7 *tmpStride];\
2285
        const int tmp8= tmp[8 *tmpStride];\
2286
        const int tmp9= tmp[9 *tmpStride];\
2287
        const int tmp10=tmp[10*tmpStride];\
2288
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2289
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2290
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2291
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2292
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2293
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2294
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2295
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2296
        dst++;\
2297
        tmp++;\
2298
    }\
2299
}\
2300
\
2301
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2302
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2303
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2304
    src += 8*srcStride;\
2305
    dst += 8*dstStride;\
2306
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2307
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2308
}\
2309
\
2310
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2311
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2312
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2313
    src += 8*srcStride;\
2314
    dst += 8*dstStride;\
2315
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2316
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2317
}\
2318
\
2319
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2320
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2321
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2322
    src += 8*srcStride;\
2323
    dst += 8*dstStride;\
2324
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2325
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2326
}\
2327

    
2328
/**
 * Generate the 16 H.264 quarter-pel motion-compensation functions for one
 * block size.  The two digits in each name are the fractional x/y position
 * in quarter-pel units (mcXY => x=X/4, y=Y/4).  Full-pel copies use
 * OPNAME##pixels##SIZE##_c; half-pel positions use the 6-tap lowpass
 * helpers generated by H264_LOWPASS; quarter-pel positions average two
 * neighbouring half/full-pel predictions with OPNAME##pixels##SIZE##_l2.
 * OPNAME is "put_" (store) or "avg_" (average with destination).
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* (0,0): plain copy */ \
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/4,0): avg of src and H half-pel */ \
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,0): H half-pel */ \
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* (3/4,0): avg of src+1 and H half-pel */ \
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,1/4) */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,1/2): V half-pel */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,3/4) */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/4,1/4): avg of H and V half-pel */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* (3/4,1/4): V half-pel taken one column right */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/4,3/4): H half-pel taken one row down */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* (3/4,3/4) */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,1/2): 2-D half-pel */ \
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,1/4): avg of H and HV half-pel */ \
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,3/4) */ \
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/4,1/2): avg of V and HV half-pel */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* (3/4,1/2): V half-pel one column right */ \
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2464

    
2465
/* Rounding/storing ops plugged into H264_LOWPASS:
 * op_*  — for single-pass filters: intermediate values are scaled by 32,
 *         so round with +16 and shift right by 5 before clipping via cm[].
 * op2_* — for the two-pass hv filter: values are scaled by 32*32=1024,
 *         so round with +512 and shift right by 10.
 * The *_avg variants additionally average with the existing dst pixel. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass helpers and the full MC function sets for the
 * "put" and "avg" flavours and all block sizes used by the decoder. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2486

    
2487
/* Per-pixel weighted-prediction ops:
 * op_scale1 — explicit weighting of a single reference (in place).
 * op_scale2 — bidirectional weighting blending src into dst. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Generate the H.264 weighted-prediction functions for a WxH block:
 * weight_h264_pixelsWxH_c (uni-directional) and biweight_h264_pixelsWxH_c
 * (bi-directional).  Rows are fully unrolled; the W==2/4/8 `continue`s
 * cut each row short for narrower widths (W is a compile-time constant,
 * so the dead branches fold away).
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); /* rounding term */ \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; /* forced-odd rounding offset per spec */ \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* All block geometries used by H.264 inter prediction. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2556

    
2557
/**
 * WMV2 horizontal half-pel interpolation: 4-tap (-1,9,9,-1)/16 filter
 * applied to 8 pixels per row for h rows.  Reads src[-1]..src[9] on each
 * row; results are clipped to 0..255 through the crop table.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int row, x;

    for(row=0; row<h; row++){
        for(x=0; x<8; x++)
            dst[x]= cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8)>>4];
        dst += dstStride;
        src += srcStride;
    }
}
2574

    
2575
#ifdef CONFIG_CAVS_DECODER
2576
/* AVS specific */
2577
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2578

    
2579
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2580
    put_pixels8_c(dst, src, stride, 8);
2581
}
2582
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2583
    avg_pixels8_c(dst, src, stride, 8);
2584
}
2585
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2586
    put_pixels16_c(dst, src, stride, 16);
2587
}
2588
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2589
    avg_pixels16_c(dst, src, stride, 16);
2590
}
2591
#endif /* CONFIG_CAVS_DECODER */
2592

    
2593
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2594
/* VC-1 specific */
2595
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2596

    
2597
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2598
    put_pixels8_c(dst, src, stride, 8);
2599
}
2600
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2601

    
2602
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2603

    
2604
/* H264 specific */
2605
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2606

    
2607
/**
 * WMV2 vertical half-pel interpolation: 4-tap (-1,9,9,-1)/16 filter down
 * each of w columns, producing 8 output rows per column.  Reads rows
 * -1..9 of src; results are clipped to 0..255 through the crop table.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int col, y;

    for(col=0; col<w; col++){
        for(y=0; y<8; y++)
            dst[y*dstStride]= cm[(9*(src[ y   *srcStride] + src[(y+1)*srcStride])
                                 -  (src[(y-1)*srcStride] + src[(y+2)*srcStride]) + 8)>>4];
        src++;
        dst++;
    }
}
2635

    
2636
/* WMV2 mspel (0,0): full-pel position, plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2639

    
2640
/* WMV2 mspel (1/4,0): average of full-pel src and horizontal half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2645

    
2646
/* WMV2 mspel (1/2,0): horizontal half-pel, filtered straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2649

    
2650
/* WMV2 mspel (3/4,0): average of src shifted one pixel right and the
 * horizontal half-pel prediction. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2655

    
2656
/* WMV2 mspel (0,1/2): vertical half-pel, filtered straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2659

    
2660
/* WMV2 mspel (1/4,1/2): average of the vertical half-pel (halfV) and the
 * 2-D half-pel (halfHV).  halfH holds 11 horizontally-filtered rows
 * starting one row above the block, so halfH+8 skips the lead-in row
 * needed by the vertical filter. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2669
/* WMV2 mspel (3/4,1/2): like mc12 but the vertical half-pel is taken one
 * column to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2678
/* WMV2 mspel (1/2,1/2): horizontal filter over 11 rows (one lead-in row),
 * then vertical filter of the intermediate, written straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2683

    
2684
/**
 * H.263 deblocking across a horizontal block edge: filters 8 columns.
 * The edge lies between src[-stride] and src[0]; two pixels on each side
 * (p0..p3) are modified in place.
 * @param qscale quantizer; indexes ff_h263_loop_filter_strength
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response: full correction d for small |d|,
           ramping back to zero once |d| reaches 2*strength (strong
           differences are treated as real edges and left alone) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless-ish clamp to 0..255: bit 8 set means the value left
           [0,255]; ~(p>>31) yields 0 for negative p, 255 (as a byte) for
           positive overflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction on the outer pixels, bounded by
           half the inner correction */
        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
2720

    
2721
/**
 * H.263 deblocking across a vertical block edge: filters 8 rows.
 * The edge lies between src[-1] and src[0]; two pixels on each side
 * (p0..p3) are modified in place.  Same algorithm as
 * h263_v_loop_filter_c, transposed.
 * @param qscale quantizer; indexes ff_h263_loop_filter_strength
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response; see h263_v_loop_filter_c */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 via the bit-8 overflow trick */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        /* weaker secondary correction on the outer pixels */
        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
2757

    
2758
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * in place.  A vertical pass fills temp[] (values scaled by 4; the top
 * and bottom rows are passed through unfiltered as 4*src), then a
 * horizontal pass writes back with rounding, leaving the left and right
 * columns only vertically filtered.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int x, y;

    /* vertical pass: temp holds 4x-scaled, vertically filtered pixels */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++)
            temp[y*8 + x] = src[y*stride + x - stride]
                          + 2*src[y*stride + x]
                          + src[y*stride + x + stride];
    }

    /* horizontal pass: filter temp and write back with rounding */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++)
            src[y*stride + x] = (temp[y*8 + x - 1]
                               + 2*temp[y*8 + x]
                               + temp[y*8 + x + 1] + 8)>>4;
    }
}
2784

    
2785
/**
 * H.264 normal-mode luma deblocking of one 16-pixel edge, processed as
 * four 4-pixel segments.  xstride steps across the edge, ystride along
 * it, so the same code serves vertical and horizontal edges.
 * @param tc0 per-segment clipping limits; a negative entry skips that
 *            segment entirely
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            /* three pixels on each side of the edge (p* before, q* after) */
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only if the edge looks like a blocking artifact
               rather than a real image edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* flat inner side: also adjust p1/q1, and widen the
                   clipping range for the p0/q0 delta by one per side */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2825
/* Vertical-direction luma deblocking (horizontal edge): step across the
 * edge by stride, along it by 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
2829
/* Horizontal-direction luma deblocking (vertical edge): step across the
 * edge by 1, along it by stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2833

    
2834
/**
 * H.264 normal-mode chroma deblocking of one 8-pixel edge, processed as
 * four 2-pixel segments.  Only p0/q0 are modified; a non-positive tc0
 * entry skips its segment.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, d;
    for( seg = 0; seg < 4; seg++ ) {
        const int tc = tc0[seg];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* skip real image edges (De Morgan of the spec's condition) */
            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            {
                const int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
        }
    }
}
2862
/* Vertical-direction chroma deblocking (horizontal edge). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
2866
/* Horizontal-direction chroma deblocking (vertical edge). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2870

    
2871
/**
 * H.264 strong (intra) chroma deblocking of one 8-pixel edge: where the
 * edge passes the alpha/beta activity tests, p0 and q0 are replaced by
 * fixed (2,1,1)/4 averages of their neighbours.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++, pix += ystride ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave genuine edges untouched (De Morgan of the spec's test) */
        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
2890
/* Vertical-direction intra chroma deblocking (horizontal edge). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
2894
/* Horizontal-direction intra chroma deblocking (vertical edge). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2898

    
2899
/**
 * Sum of absolute differences (SAD) over a 16-pixel-wide block of h rows.
 * @param v unused context pointer (matches the me_cmp function signature)
 * @return  sum of |pix1[i]-pix2[i]| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2926

    
2927
/**
 * SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (rounded average of each pixel and its right neighbour), 16 wide,
 * h rows.  Reads pix2[0..16] on each row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2954

    
2955
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * interpolated at half-pel vertical offset (avg2 of vertical neighbours).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2984

    
2985
/**
 * Sum of absolute differences between a 16-wide block and a reference
 * interpolated at half-pel offset in both directions (avg4 of the 2x2
 * neighbourhood).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3014

    
3015
/**
 * Plain sum of absolute differences over an 8-wide block of h rows.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3034

    
3035
/**
 * Sum of absolute differences between an 8-wide block and a reference
 * interpolated at half-pel horizontal offset.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3054

    
3055
/**
 * Sum of absolute differences between an 8-wide block and a reference
 * interpolated at half-pel vertical offset.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3076

    
3077
/**
 * Sum of absolute differences between an 8-wide block and a reference
 * interpolated at half-pel offset in both directions.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3098

    
3099
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3100
    MpegEncContext *c = v;
3101
    int score1=0;
3102
    int score2=0;
3103
    int x,y;
3104

    
3105
    for(y=0; y<h; y++){
3106
        for(x=0; x<16; x++){
3107
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3108
        }
3109
        if(y+1<h){
3110
            for(x=0; x<15; x++){
3111
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3112
                             - s1[x+1] + s1[x+1+stride])
3113
                        -FFABS(  s2[x  ] - s2[x  +stride]
3114
                             - s2[x+1] + s2[x+1+stride]);
3115
            }
3116
        }
3117
        s1+= stride;
3118
        s2+= stride;
3119
    }
3120

    
3121
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3122
    else  return score1 + FFABS(score2)*8;
3123
}
3124

    
3125
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3126
    MpegEncContext *c = v;
3127
    int score1=0;
3128
    int score2=0;
3129
    int x,y;
3130

    
3131
    for(y=0; y<h; y++){
3132
        for(x=0; x<8; x++){
3133
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3134
        }
3135
        if(y+1<h){
3136
            for(x=0; x<7; x++){
3137
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3138
                             - s1[x+1] + s1[x+1+stride])
3139
                        -FFABS(  s2[x  ] - s2[x  +stride]
3140
                             - s2[x+1] + s2[x+1+stride]);
3141
            }
3142
        }
3143
        s1+= stride;
3144
        s2+= stride;
3145
    }
3146

    
3147
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3148
    else  return score1 + FFABS(score2)*8;
3149
}
3150

    
3151
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3152
    int i;
3153
    unsigned int sum=0;
3154

    
3155
    for(i=0; i<8*8; i++){
3156
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3157
        int w= weight[i];
3158
        b>>= RECON_SHIFT;
3159
        assert(-512<b && b<512);
3160

    
3161
        sum += (w*b)*(w*b)>>4;
3162
    }
3163
    return sum>>2;
3164
}
3165

    
3166
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3167
    int i;
3168

    
3169
    for(i=0; i<8*8; i++){
3170
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3171
    }
3172
}
3173

    
3174
/**
3175
 * permutes an 8x8 block.
3176
 * @param block the block which will be permuted according to the given permutation vector
3177
 * @param permutation the permutation vector
3178
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3179
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3180
 *                  (inverse) permutated to scantable order!
3181
 */
3182
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3183
{
3184
    int i;
3185
    DCTELEM temp[64];
3186

    
3187
    if(last<=0) return;
3188
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3189

    
3190
    for(i=0; i<=last; i++){
3191
        const int j= scantable[i];
3192
        temp[j]= block[j];
3193
        block[j]=0;
3194
    }
3195

    
3196
    for(i=0; i<=last; i++){
3197
        const int j= scantable[i];
3198
        const int perm_j= permutation[j];
3199
        block[perm_j]= temp[j];
3200
    }
3201
}
3202

    
3203
/**
 * Dummy compare function for FF_CMP_ZERO: every candidate scores 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3206

    
3207
/**
 * Fill the cmp[0..4] function-pointer array (one slot per block size)
 * with the comparison functions from c selected by the FF_CMP_* code
 * in the low byte of type.
 * @param c    DSPContext holding the candidate comparison functions
 * @param cmp  destination array of 5 function pointers
 * @param type FF_CMP_* selector (only type & 0xFF is examined)
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        /* wavelet metrics are only available when the snow encoder is built */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* unknown selector: the slot stays NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3266

    
3267
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zero the six 64-coefficient blocks of one macroblock in one call.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3274

    
3275
/**
 * dst[i] += src[i] for i in [0, w); bytes wrap modulo 256.
 * The main loop is unrolled eight-wide; a scalar loop mops up the tail.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    for (; i <= w - 8; i += 8) {
        dst[i]     += src[i];
        dst[i + 1] += src[i + 1];
        dst[i + 2] += src[i + 2];
        dst[i + 3] += src[i + 3];
        dst[i + 4] += src[i + 4];
        dst[i + 5] += src[i + 5];
        dst[i + 6] += src[i + 6];
        dst[i + 7] += src[i + 7];
    }
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
3290

    
3291
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w); bytes wrap modulo 256.
 * The main loop is unrolled eight-wide; a scalar loop mops up the tail.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    for (; i <= w - 8; i += 8) {
        dst[i]     = src1[i]     - src2[i];
        dst[i + 1] = src1[i + 1] - src2[i + 1];
        dst[i + 2] = src1[i + 2] - src2[i + 2];
        dst[i + 3] = src1[i + 3] - src2[i + 3];
        dst[i + 4] = src1[i + 4] - src2[i + 4];
        dst[i + 5] = src1[i + 5] - src2[i + 5];
        dst[i + 6] = src1[i + 6] - src2[i + 6];
        dst[i + 7] = src1[i + 7] - src2[i + 7];
    }
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
3306

    
3307
/**
 * HuffYUV median-prediction subtraction: dst[i] = src2[i] minus the
 * median predictor built from the left sample, the sample above
 * (src1[i]) and the gradient left + above - above-left.
 * *left / *left_top carry the running state across calls and are
 * updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t left_val, topleft;
    int i;

    left_val = *left;
    topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i],
                                  (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        left_val = src2[i];
        dst[i]   = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft;
}
3324

    
3325
/* Butterfly step: write sum and difference of i1,i2 into o1,o2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: (x,y) becomes (x+y, x-y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded with the absolute-value accumulation:
 * |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3339

    
3340
/**
 * SATD of an 8x8 block: 2-D 8-point Hadamard transform of the
 * difference src - dst, then sum of absolute transform coefficients.
 * The last vertical butterfly stage is fused with the absolute-value
 * sum via BUTTERFLYA. h must be 8.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column; last stage fused with |.| sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3391

    
3392
/**
 * Intra SATD of an 8x8 block: like hadamard8_diff8x8_c but transforms
 * src itself (dummy is unused), and subtracts |DC| at the end so the
 * block mean does not contribute to the score. h must be 8.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform; last stage fused with the |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3439

    
3440
/**
 * DCT-domain SAD of an 8x8 block: forward DCT of the pixel difference,
 * then sum of absolute coefficients via the DSP hook. h must be 8.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 16-byte alignment required by SIMD fdct implementations */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3451

    
3452
#ifdef CONFIG_GPL
3453
#define DCT8_1D {\
3454
    const int s07 = SRC(0) + SRC(7);\
3455
    const int s16 = SRC(1) + SRC(6);\
3456
    const int s25 = SRC(2) + SRC(5);\
3457
    const int s34 = SRC(3) + SRC(4);\
3458
    const int a0 = s07 + s34;\
3459
    const int a1 = s16 + s25;\
3460
    const int a2 = s07 - s34;\
3461
    const int a3 = s16 - s25;\
3462
    const int d07 = SRC(0) - SRC(7);\
3463
    const int d16 = SRC(1) - SRC(6);\
3464
    const int d25 = SRC(2) - SRC(5);\
3465
    const int d34 = SRC(3) - SRC(4);\
3466
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3467
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3468
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3469
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3470
    DST(0,  a0 + a1     ) ;\
3471
    DST(1,  a4 + (a7>>2)) ;\
3472
    DST(2,  a2 + (a3>>1)) ;\
3473
    DST(3,  a5 + (a6>>2)) ;\
3474
    DST(4,  a0 - a1     ) ;\
3475
    DST(5,  a6 - (a5>>2)) ;\
3476
    DST(6, (a2>>1) - a3 ) ;\
3477
    DST(7, (a4>>2) - a7 ) ;\
3478
}
3479

    
3480
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3481
    MpegEncContext * const s= (MpegEncContext *)c;
3482
    DCTELEM dct[8][8];
3483
    int i;
3484
    int sum=0;
3485

    
3486
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3487

    
3488
#define SRC(x) dct[i][x]
3489
#define DST(x,v) dct[i][x]= v
3490
    for( i = 0; i < 8; i++ )
3491
        DCT8_1D
3492
#undef SRC
3493
#undef DST
3494

    
3495
#define SRC(x) dct[x][i]
3496
#define DST(x,v) sum += FFABS(v)
3497
    for( i = 0; i < 8; i++ )
3498
        DCT8_1D
3499
#undef SRC
3500
#undef DST
3501
    return sum;
3502
}
3503
#endif
3504

    
3505
/**
 * Maximum absolute DCT coefficient of the 8x8 pixel difference.
 * h must be 8.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    /* track the largest |coefficient| */
    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3521

    
3522
/**
 * Quantization-noise metric: forward DCT of the difference, quantize +
 * dequantize it, and return the SSE between the reconstructed and the
 * original coefficients. h must be 8. Note: forces s->mb_intra = 0.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* first 64 elements: working copy, second 64: pristine backup */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    /* squared error introduced by the quantize/dequantize round trip */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3545

    
3546
/**
 * Rate-distortion cost of coding an 8x8 block: quantizes the DCT of the
 * difference, counts the VLC bits of the resulting run/level pairs,
 * reconstructs the block, and returns
 * SSE(reconstruction, src1) + lambda-scaled bit cost. h must be 8.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 reference block (two 32-bit copies per row) so the
       reconstruction can be added onto it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables for intra vs inter; intra also pays for DC */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count bits for all coefficients before the last one */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias into the table's index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct onto the saved reference block */
    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* distortion + lambda*rate; 109/128 ~ presumably a tuned lambda
       scale factor (NOTE(review): constant's derivation not documented
       here) */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3624

    
3625
/**
 * Bit-cost metric for an 8x8 block: quantize the DCT of the difference
 * and return the number of VLC bits needed to code the run/level
 * pairs (no distortion term). h must be 8.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables for intra vs inter; intra also pays for DC */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias into the table's index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3684

    
3685
/**
 * Vertical-gradient SAD of a single image: sum of |s[x,y] - s[x,y+1]|
 * over a 16-wide block (dummy is unused).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += FFABS(s[x] - s[x + stride]);
        s += stride;
    }

    return total;
}
3699

    
3700
/**
 * Vertical-gradient SAD of the difference image: sum of the absolute
 * vertical gradient of (s1 - s2) over a 16-wide block.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3714

    
3715
/* squared value helper for the vsse metrics below */
#define SQ(a) ((a)*(a))

/**
 * Vertical-gradient SSE of a single image: sum of squared vertical
 * pixel differences over a 16-wide block (dummy is unused).
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return total;
}
3730

    
3731
/**
 * Vertical-gradient SSE of the difference image: sum of the squared
 * vertical gradient of (s1 - s2) over a 16-wide block.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3745

    
3746
/**
 * Sum of squared differences between an int8 array and an int16 array
 * of the same length.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int idx;

    for (idx = 0; idx < size; idx++) {
        const int d = pix1[idx] - pix2[idx];
        total += d * d;
    }
    return total;
}
3754

    
3755
/* Instantiate 16x16 comparison functions from the 8x8 kernels above via
 * the WARPER8_16_SQ macro (defined earlier in this file; "WARPER" is a
 * historical misspelling of "wrapper" kept for API stability).
 * NOTE(review): presumably each wrapper applies the 8x8 function to the
 * four quadrants of a 16x16 block — confirm against the macro body. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3765

    
3766
/**
 * Element-wise in-place multiply: dst[i] = dst[i] * src[i].
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int idx;

    for (idx = 0; idx < len; idx++)
        dst[idx] = dst[idx] * src[idx];
}
3771

    
3772
/**
 * Element-wise multiply with src1 traversed in reverse:
 * dst[i] = src0[i] * src1[len-1-i].
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int idx;

    for (idx = 0; idx < len; idx++)
        dst[idx] = src0[idx] * src1[len - 1 - idx];
}
3778

    
3779
/**
 * Multiply-add with integer bias and strided output:
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int idx;

    for (idx = 0; idx < len; idx++) {
        const float prod = src0[idx] * src1[idx];
        dst[idx * step] = prod + src2[idx] + src3;
    }
}
3784

    
3785
/**
 * Convert floats to signed 16-bit samples using IEEE-754 bit tricks.
 * NOTE(review): reinterprets each float's bit pattern through an
 * int32_t* cast (strict-aliasing violation; assumes IEEE-754 single
 * precision) — the input is presumably pre-biased so that in-range
 * values map their low bits directly to [0, 0xFFFF]; confirm against
 * the callers that prepare src.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((int32_t*)src)[i];
        if(tmp & 0xf0000){
            /* out of range: clamp to 0 or 0xFFFF from the sign of the
               comparison against the top-of-range bit pattern */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;  /* remove the bias to get a signed sample */
    }
}
3798

    
3799
/* 11-bit fixed-point cosine constants for the WMV2 IDCT below:
 * Wk = round(2048*sqrt(2)*cos(k*pi/16)), with W0 = W4 = 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3807

    
3808
/**
 * One row of the 8-point WMV2 inverse DCT, fixed point using the Wk
 * constants above; results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: butterfly recombination with rounding (1<<7) and >>8 */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
3834
/**
 * One column (stride 8) of the 8-point WMV2 inverse DCT. Keeps three
 * extra bits of precision in step 1 and removes them in the final
 * rounding (>>14 instead of the row pass's >>8).
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: butterfly recombination with rounding (1<<13) and >>14 */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
3861
/**
 * In-place 2-D 8x8 inverse DCT used by the WMV2 decoder:
 * a 1-D pass over every row, then a 1-D pass over every column.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 64; row += 8)
        wmv2_idct_row(block + row);
    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
3871
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
3873
/* WMV2 IDCT, then store the result clamped to [0,255] into dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
3878
/* WMV2 IDCT, then add the result to dest with clamping. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
3883
/* JPEG-reference 8x8 IDCT, then clamped store into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
3888
/* JPEG-reference 8x8 IDCT, then clamped add into dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
3893

    
3894
/* 4x4 reduced-resolution IDCT (lowres=1), then clamped store. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}
3899
/* 4x4 reduced-resolution IDCT (lowres=1), then clamped add. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3904

    
3905
/* 2x2 reduced-resolution IDCT (lowres=2), then clamped store. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}
3910
/* 2x2 reduced-resolution IDCT (lowres=2), then clamped add. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3915

    
3916
/* 1x1 "IDCT" (lowres=3): only the DC coefficient contributes;
 * round it, scale by >>3 and clamp to [0,255].  line_size is unused
 * since a single pixel is written. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *clip = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = clip[(block[0] + 4) >> 3];
}
3922
/* 1x1 "IDCT" (lowres=3), add variant: add the rounded, scaled DC term
 * to the existing pixel and clamp to [0,255]. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *clip = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = clip[dest[0] + ((block[0] + 4) >> 3)];
}
3928

    
3929
/* Do-nothing placeholder: ignores all of its arguments. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
    return;
}
3930

    
3931
/* init static data */
3932
void dsputil_static_init(void)
3933
{
3934
    int i;
3935

    
3936
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3937
    for(i=0;i<MAX_NEG_CROP;i++) {
3938
        ff_cropTbl[i] = 0;
3939
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3940
    }
3941

    
3942
    for(i=0;i<512;i++) {
3943
        ff_squareTbl[i] = (i - 256) * (i - 256);
3944
    }
3945

    
3946
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3947
}
3948

    
3949
int ff_check_alignment(void){
3950
    static int did_fail=0;
3951
    DECLARE_ALIGNED_16(int, aligned);
3952

    
3953
    if((long)&aligned & 15){
3954
        if(!did_fail){
3955
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3956
            av_log(NULL, AV_LOG_ERROR,
3957
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3958
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
3959
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3960
                "Do not report crashes to FFmpeg developers.\n");
3961
#endif
3962
            did_fail=1;
3963
        }
3964
        return -1;
3965
    }
3966
    return 0;
3967
}
3968

    
3969
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3970
{
3971
    int i;
3972

    
3973
    ff_check_alignment();
3974

    
3975
#ifdef CONFIG_ENCODERS
3976
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3977
        c->fdct = fdct_ifast;
3978
        c->fdct248 = fdct_ifast248;
3979
    }
3980
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3981
        c->fdct = ff_faandct;
3982
        c->fdct248 = ff_faandct248;
3983
    }
3984
    else {
3985
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3986
        c->fdct248 = ff_fdct248_islow;
3987
    }
3988
#endif //CONFIG_ENCODERS
3989

    
3990
    if(avctx->lowres==1){
3991
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3992
            c->idct_put= ff_jref_idct4_put;
3993
            c->idct_add= ff_jref_idct4_add;
3994
        }else{
3995
            c->idct_put= ff_h264_lowres_idct_put_c;
3996
            c->idct_add= ff_h264_lowres_idct_add_c;
3997
        }
3998
        c->idct    = j_rev_dct4;
3999
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4000
    }else if(avctx->lowres==2){
4001
        c->idct_put= ff_jref_idct2_put;
4002
        c->idct_add= ff_jref_idct2_add;
4003
        c->idct    = j_rev_dct2;
4004
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4005
    }else if(avctx->lowres==3){
4006
        c->idct_put= ff_jref_idct1_put;
4007
        c->idct_add= ff_jref_idct1_add;
4008
        c->idct    = j_rev_dct1;
4009
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4010
    }else{
4011
        if(avctx->idct_algo==FF_IDCT_INT){
4012
            c->idct_put= ff_jref_idct_put;
4013
            c->idct_add= ff_jref_idct_add;
4014
            c->idct    = j_rev_dct;
4015
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4016
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4017
                avctx->idct_algo==FF_IDCT_VP3){
4018
            c->idct_put= ff_vp3_idct_put_c;
4019
            c->idct_add= ff_vp3_idct_add_c;
4020
            c->idct    = ff_vp3_idct_c;
4021
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4022
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4023
            c->idct_put= ff_wmv2_idct_put_c;
4024
            c->idct_add= ff_wmv2_idct_add_c;
4025
            c->idct    = ff_wmv2_idct_c;
4026
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4027
        }else{ //accurate/default
4028
            c->idct_put= ff_simple_idct_put;
4029
            c->idct_add= ff_simple_idct_add;
4030
            c->idct    = ff_simple_idct;
4031
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4032
        }
4033
    }
4034

    
4035
    if (ENABLE_H264_DECODER) {
4036
        c->h264_idct_add= ff_h264_idct_add_c;
4037
        c->h264_idct8_add= ff_h264_idct8_add_c;
4038
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4039
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4040
    }
4041

    
4042
    c->get_pixels = get_pixels_c;
4043
    c->diff_pixels = diff_pixels_c;
4044
    c->put_pixels_clamped = put_pixels_clamped_c;
4045
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4046
    c->add_pixels_clamped = add_pixels_clamped_c;
4047
    c->add_pixels8 = add_pixels8_c;
4048
    c->add_pixels4 = add_pixels4_c;
4049
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4050
    c->gmc1 = gmc1_c;
4051
    c->gmc = ff_gmc_c;
4052
    c->clear_blocks = clear_blocks_c;
4053
    c->pix_sum = pix_sum_c;
4054
    c->pix_norm1 = pix_norm1_c;
4055

    
4056
    /* TODO [0] 16  [1] 8 */
4057
    c->pix_abs[0][0] = pix_abs16_c;
4058
    c->pix_abs[0][1] = pix_abs16_x2_c;
4059
    c->pix_abs[0][2] = pix_abs16_y2_c;
4060
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4061
    c->pix_abs[1][0] = pix_abs8_c;
4062
    c->pix_abs[1][1] = pix_abs8_x2_c;
4063
    c->pix_abs[1][2] = pix_abs8_y2_c;
4064
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4065

    
4066
#define dspfunc(PFX, IDX, NUM) \
4067
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4068
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4069
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4070
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4071

    
4072
    dspfunc(put, 0, 16);
4073
    dspfunc(put_no_rnd, 0, 16);
4074
    dspfunc(put, 1, 8);
4075
    dspfunc(put_no_rnd, 1, 8);
4076
    dspfunc(put, 2, 4);
4077
    dspfunc(put, 3, 2);
4078

    
4079
    dspfunc(avg, 0, 16);
4080
    dspfunc(avg_no_rnd, 0, 16);
4081
    dspfunc(avg, 1, 8);
4082
    dspfunc(avg_no_rnd, 1, 8);
4083
    dspfunc(avg, 2, 4);
4084
    dspfunc(avg, 3, 2);
4085
#undef dspfunc
4086

    
4087
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4088
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4089

    
4090
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4091
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4092
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4093
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4094
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4095
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4096
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4097
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4098
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4099

    
4100
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4101
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4102
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4103
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4104
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4105
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4106
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4107
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4108
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4109

    
4110
#define dspfunc(PFX, IDX, NUM) \
4111
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4112
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4113
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4114
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4115
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4116
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4117
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4118
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4119
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4120
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4121
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4122
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4123
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4124
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4125
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4126
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4127

    
4128
    dspfunc(put_qpel, 0, 16);
4129
    dspfunc(put_no_rnd_qpel, 0, 16);
4130

    
4131
    dspfunc(avg_qpel, 0, 16);
4132
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4133

    
4134
    dspfunc(put_qpel, 1, 8);
4135
    dspfunc(put_no_rnd_qpel, 1, 8);
4136

    
4137
    dspfunc(avg_qpel, 1, 8);
4138
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4139

    
4140
    dspfunc(put_h264_qpel, 0, 16);
4141
    dspfunc(put_h264_qpel, 1, 8);
4142
    dspfunc(put_h264_qpel, 2, 4);
4143
    dspfunc(put_h264_qpel, 3, 2);
4144
    dspfunc(avg_h264_qpel, 0, 16);
4145
    dspfunc(avg_h264_qpel, 1, 8);
4146
    dspfunc(avg_h264_qpel, 2, 4);
4147

    
4148
#undef dspfunc
4149
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4150
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4151
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4152
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4153
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4154
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4155
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4156

    
4157
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4158
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4159
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4160
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4161
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4162
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4163
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4164
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4165
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4166
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4167
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4168
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4169
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4170
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4171
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4172
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4173
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4174
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4175
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4176
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4177

    
4178
#ifdef CONFIG_CAVS_DECODER
4179
    ff_cavsdsp_init(c,avctx);
4180
#endif
4181
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4182
    ff_vc1dsp_init(c,avctx);
4183
#endif
4184
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4185
    ff_intrax8dsp_init(c,avctx);
4186
#endif
4187
#if defined(CONFIG_H264_ENCODER)
4188
    ff_h264dspenc_init(c,avctx);
4189
#endif
4190

    
4191
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4192
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4193
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4194
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4195
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4196
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4197
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4198
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4199

    
4200
#define SET_CMP_FUNC(name) \
4201
    c->name[0]= name ## 16_c;\
4202
    c->name[1]= name ## 8x8_c;
4203

    
4204
    SET_CMP_FUNC(hadamard8_diff)
4205
    c->hadamard8_diff[4]= hadamard8_intra16_c;
4206
    SET_CMP_FUNC(dct_sad)
4207
    SET_CMP_FUNC(dct_max)
4208
#ifdef CONFIG_GPL
4209
    SET_CMP_FUNC(dct264_sad)
4210
#endif
4211
    c->sad[0]= pix_abs16_c;
4212
    c->sad[1]= pix_abs8_c;
4213
    c->sse[0]= sse16_c;
4214
    c->sse[1]= sse8_c;
4215
    c->sse[2]= sse4_c;
4216
    SET_CMP_FUNC(quant_psnr)
4217
    SET_CMP_FUNC(rd)
4218
    SET_CMP_FUNC(bit)
4219
    c->vsad[0]= vsad16_c;
4220
    c->vsad[4]= vsad_intra16_c;
4221
    c->vsse[0]= vsse16_c;
4222
    c->vsse[4]= vsse_intra16_c;
4223
    c->nsse[0]= nsse16_c;
4224
    c->nsse[1]= nsse8_c;
4225
#ifdef CONFIG_SNOW_ENCODER
4226
    c->w53[0]= w53_16_c;
4227
    c->w53[1]= w53_8_c;
<