/* Provenance: ffmpeg / libavcodec / dsputil.c @ revision 84705403
 * (web-viewer chrome from the scraped page removed). */

1
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
21
 
22
/**
 * @file dsputil.c
 * DSP utils
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31

    
32

    
33
/* Clamp lookup table: (cropTbl + MAX_NEG_CROP)[x] maps x to [0,255];
 * the MAX_NEG_CROP guard entries on each side absorb out-of-range
 * indices (see put_pixels_clamped_c / add_pixels_clamped_c below).
 * NOTE(review): presumably filled by the dsputil init code outside
 * this chunk -- confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table, always used biased as (squareTbl + 256) so that
 * signed differences in [-255,255] can be squared by table lookup
 * (see pix_norm1_c / sse8_c / sse16_c below).
 * NOTE(review): presumably filled by init code outside this chunk -- confirm. */
uint32_t squareTbl[512];
35

    
36
/* Zigzag scan order: entry i is the row-major position inside the 8x8
 * block of the i-th coefficient in scan order ("direct", i.e. not
 * permutated for any SIMD IDCT). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46

    
47
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
 * NOTE(review): zero here; presumably filled at runtime by init code
 * outside this chunk -- confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
49

    
50
/* Alternate horizontal scan order for 8x8 blocks: entry i is the
 * row-major block position of the i-th coefficient in scan order.
 * NOTE(review): presumably the alternate-horizontal scan used by some
 * MPEG-family codecs -- confirm against the codecs that select it. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
60

    
61
/* Alternate vertical scan order for 8x8 blocks: entry i is the
 * row-major block position of the i-th coefficient in scan order.
 * NOTE(review): presumably the MPEG-2 "alternate scan" for interlaced
 * content -- confirm against the codecs that select it. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
71

    
72
/* Reciprocal table for division by multiplication:
 * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255,
 * i.e. inverse[b] is approximately 2^32/b (rounded so the identity
 * above holds over the stated range).  inverse[0] is unused padding,
 * inverse[1] saturates at 2^32-1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107

    
108
/* Input permutation for the simple_idct_mmx: entry i gives the index
 * where coefficient i must be stored so that the MMX IDCT reads its
 * input in the layout it expects.
 * NOTE(review): the exact layout is dictated by simple_idct_mmx
 * (not in this chunk) -- confirm there before modifying. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119

    
120
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;       /* advance to the next row */
    }
    return sum;
}
141

    
142
static int pix_norm1_c(uint8_t * pix, int line_size)
143
{
144
    int s, i, j;
145
    uint32_t *sq = squareTbl + 256;
146

    
147
    s = 0;
148
    for (i = 0; i < 16; i++) {
149
        for (j = 0; j < 16; j += 8) {
150
#if 0
151
            s += sq[pix[0]];
152
            s += sq[pix[1]];
153
            s += sq[pix[2]];
154
            s += sq[pix[3]];
155
            s += sq[pix[4]];
156
            s += sq[pix[5]];
157
            s += sq[pix[6]];
158
            s += sq[pix[7]];
159
#else
160
#if LONG_MAX > 2147483647
161
            register uint64_t x=*(uint64_t*)pix;
162
            s += sq[x&0xff];
163
            s += sq[(x>>8)&0xff];
164
            s += sq[(x>>16)&0xff];
165
            s += sq[(x>>24)&0xff];
166
            s += sq[(x>>32)&0xff];
167
            s += sq[(x>>40)&0xff];
168
            s += sq[(x>>48)&0xff];
169
            s += sq[(x>>56)&0xff];
170
#else
171
            register uint32_t x=*(uint32_t*)pix;
172
            s += sq[x&0xff];
173
            s += sq[(x>>8)&0xff];
174
            s += sq[(x>>16)&0xff];
175
            s += sq[(x>>24)&0xff];
176
            x=*(uint32_t*)(pix+4);
177
            s += sq[x&0xff];
178
            s += sq[(x>>8)&0xff];
179
            s += sq[(x>>16)&0xff];
180
            s += sq[(x>>24)&0xff];
181
#endif
182
#endif
183
            pix += 8;
184
        }
185
        pix += line_size - 16;
186
    }
187
    return s;
188
}
189

    
190
/**
 * Byte-swap an array of 32-bit words (endianness conversion).
 * The main loop is unrolled eight words at a time; a scalar loop
 * handles the remainder.
 * @param dst destination array
 * @param src source array
 * @param w   number of 32-bit words to convert
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* remaining 0..7 words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
207

    
208
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
209
{
210
    int s, i;
211
    uint32_t *sq = squareTbl + 256;
212

    
213
    s = 0;
214
    for (i = 0; i < 8; i++) {
215
        s += sq[pix1[0] - pix2[0]];
216
        s += sq[pix1[1] - pix2[1]];
217
        s += sq[pix1[2] - pix2[2]];
218
        s += sq[pix1[3] - pix2[3]];
219
        s += sq[pix1[4] - pix2[4]];
220
        s += sq[pix1[5] - pix2[5]];
221
        s += sq[pix1[6] - pix2[6]];
222
        s += sq[pix1[7] - pix2[7]];
223
        pix1 += line_size;
224
        pix2 += line_size;
225
    }
226
    return s;
227
}
228

    
229
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
230
{
231
    int s, i;
232
    uint32_t *sq = squareTbl + 256;
233

    
234
    s = 0;
235
    for (i = 0; i < 16; i++) {
236
        s += sq[pix1[ 0] - pix2[ 0]];
237
        s += sq[pix1[ 1] - pix2[ 1]];
238
        s += sq[pix1[ 2] - pix2[ 2]];
239
        s += sq[pix1[ 3] - pix2[ 3]];
240
        s += sq[pix1[ 4] - pix2[ 4]];
241
        s += sq[pix1[ 5] - pix2[ 5]];
242
        s += sq[pix1[ 6] - pix2[ 6]];
243
        s += sq[pix1[ 7] - pix2[ 7]];
244
        s += sq[pix1[ 8] - pix2[ 8]];
245
        s += sq[pix1[ 9] - pix2[ 9]];
246
        s += sq[pix1[10] - pix2[10]];
247
        s += sq[pix1[11] - pix2[11]];
248
        s += sq[pix1[12] - pix2[12]];
249
        s += sq[pix1[13] - pix2[13]];
250
        s += sq[pix1[14] - pix2[14]];
251
        s += sq[pix1[15] - pix2[15]];
252

    
253
        pix1 += line_size;
254
        pix2 += line_size;
255
    }
256
    return s;
257
}
258

    
259
/**
 * Copy an 8x8 pixel block into a contiguous DCT coefficient array.
 * @param block     destination, 64 consecutive DCTELEMs (row-major)
 * @param pixels    top-left source pixel
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;   /* next source row */
        block += 8;            /* destination is packed 8-wide */
    }
}
277

    
278
/**
 * Store the element-wise difference of two 8x8 pixel blocks into a
 * contiguous DCT coefficient array.
 * @param block  destination, 64 consecutive DCTELEMs (row-major)
 * @param s1,s2  top-left pixels of the two source blocks
 * @param stride byte stride, shared by both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;            /* destination is packed 8-wide */
    }
}
297

    
298

    
299
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
300
                                 int line_size)
301
{
302
    int i;
303
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
304
    
305
    /* read the pixels */
306
    for(i=0;i<8;i++) {
307
        pixels[0] = cm[block[0]];
308
        pixels[1] = cm[block[1]];
309
        pixels[2] = cm[block[2]];
310
        pixels[3] = cm[block[3]];
311
        pixels[4] = cm[block[4]];
312
        pixels[5] = cm[block[5]];
313
        pixels[6] = cm[block[6]];
314
        pixels[7] = cm[block[7]];
315

    
316
        pixels += line_size;
317
        block += 8;
318
    }
319
}
320

    
321
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
322
                          int line_size)
323
{
324
    int i;
325
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
326
    
327
    /* read the pixels */
328
    for(i=0;i<8;i++) {
329
        pixels[0] = cm[pixels[0] + block[0]];
330
        pixels[1] = cm[pixels[1] + block[1]];
331
        pixels[2] = cm[pixels[2] + block[2]];
332
        pixels[3] = cm[pixels[3] + block[3]];
333
        pixels[4] = cm[pixels[4] + block[4]];
334
        pixels[5] = cm[pixels[5] + block[5]];
335
        pixels[6] = cm[pixels[6] + block[6]];
336
        pixels[7] = cm[pixels[7] + block[7]];
337
        pixels += line_size;
338
        block += 8;
339
    }
340
}
341
#if 0

/* 64-bit-word variant of the pel put/avg primitives, permanently
 * disabled by the "#if 0" above; the 32-bit variant in the #else
 * branch is the one actually compiled.
 *
 * The averaging identities used throughout:
 *   rounding:    (a|b) - (((a^b)&0xFE..FE)>>1)  == per-byte (a+b+1)>>1
 *   no rounding: (a&b) + (((a^b)&0xFE..FE)>>1)  == per-byte (a+b)>>1
 * The xy2 variants average four neighbours using split low-2-bit /
 * high-6-bit accumulators (l0/l1 and h0/h1).
 *
 * NOTE(review): the first function expands to OPNAME ## _pixels, but
 * the CALL_2X_PIXELS lines below reference OPNAME ## _pixels_c -- this
 * arm would not compile if re-enabled; fix the name before enabling.
 * NOTE(review): the (uint64_t*) stores and LD64 loads assume the
 * platform tolerates such accesses; in portable C this is strict-
 * aliasing/alignment UB -- another reason this arm stays dead. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* rounding 64-bit average: per-byte (a+b+1)>>1 */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
483
#else // 64 bit variant
484

    
485
#define PIXOP2(OPNAME, OP) \
486
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
487
    int i;\
488
    for(i=0; i<h; i++){\
489
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
490
        pixels+=line_size;\
491
        block +=line_size;\
492
    }\
493
}\
494
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
495
    int i;\
496
    for(i=0; i<h; i++){\
497
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
498
        pixels+=line_size;\
499
        block +=line_size;\
500
    }\
501
}\
502
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
503
    int i;\
504
    for(i=0; i<h; i++){\
505
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
506
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
507
        pixels+=line_size;\
508
        block +=line_size;\
509
    }\
510
}\
511
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
512
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
513
}\
514
\
515
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
516
                                                int src_stride1, int src_stride2, int h){\
517
    int i;\
518
    for(i=0; i<h; i++){\
519
        uint32_t a,b;\
520
        a= LD32(&src1[i*src_stride1  ]);\
521
        b= LD32(&src2[i*src_stride2  ]);\
522
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
523
        a= LD32(&src1[i*src_stride1+4]);\
524
        b= LD32(&src2[i*src_stride2+4]);\
525
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
526
    }\
527
}\
528
\
529
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
                                                int src_stride1, int src_stride2, int h){\
531
    int i;\
532
    for(i=0; i<h; i++){\
533
        uint32_t a,b;\
534
        a= LD32(&src1[i*src_stride1  ]);\
535
        b= LD32(&src2[i*src_stride2  ]);\
536
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
537
        a= LD32(&src1[i*src_stride1+4]);\
538
        b= LD32(&src2[i*src_stride2+4]);\
539
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
540
    }\
541
}\
542
\
543
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
                                                int src_stride1, int src_stride2, int h){\
545
    int i;\
546
    for(i=0; i<h; i++){\
547
        uint32_t a,b;\
548
        a= LD32(&src1[i*src_stride1  ]);\
549
        b= LD32(&src2[i*src_stride2  ]);\
550
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
551
    }\
552
}\
553
\
554
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
555
                                                int src_stride1, int src_stride2, int h){\
556
    int i;\
557
    for(i=0; i<h; i++){\
558
        uint32_t a,b;\
559
        a= LD16(&src1[i*src_stride1  ]);\
560
        b= LD16(&src2[i*src_stride2  ]);\
561
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
562
    }\
563
}\
564
\
565
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
566
                                                int src_stride1, int src_stride2, int h){\
567
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
568
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
569
}\
570
\
571
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
572
                                                int src_stride1, int src_stride2, int h){\
573
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
574
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
575
}\
576
\
577
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
578
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
579
}\
580
\
581
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
582
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
583
}\
584
\
585
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
586
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
587
}\
588
\
589
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
590
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
591
}\
592
\
593
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
594
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
595
    int i;\
596
    for(i=0; i<h; i++){\
597
        uint32_t a, b, c, d, l0, l1, h0, h1;\
598
        a= LD32(&src1[i*src_stride1]);\
599
        b= LD32(&src2[i*src_stride2]);\
600
        c= LD32(&src3[i*src_stride3]);\
601
        d= LD32(&src4[i*src_stride4]);\
602
        l0=  (a&0x03030303UL)\
603
           + (b&0x03030303UL)\
604
           + 0x02020202UL;\
605
        h0= ((a&0xFCFCFCFCUL)>>2)\
606
          + ((b&0xFCFCFCFCUL)>>2);\
607
        l1=  (c&0x03030303UL)\
608
           + (d&0x03030303UL);\
609
        h1= ((c&0xFCFCFCFCUL)>>2)\
610
          + ((d&0xFCFCFCFCUL)>>2);\
611
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
612
        a= LD32(&src1[i*src_stride1+4]);\
613
        b= LD32(&src2[i*src_stride2+4]);\
614
        c= LD32(&src3[i*src_stride3+4]);\
615
        d= LD32(&src4[i*src_stride4+4]);\
616
        l0=  (a&0x03030303UL)\
617
           + (b&0x03030303UL)\
618
           + 0x02020202UL;\
619
        h0= ((a&0xFCFCFCFCUL)>>2)\
620
          + ((b&0xFCFCFCFCUL)>>2);\
621
        l1=  (c&0x03030303UL)\
622
           + (d&0x03030303UL);\
623
        h1= ((c&0xFCFCFCFCUL)>>2)\
624
          + ((d&0xFCFCFCFCUL)>>2);\
625
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
626
    }\
627
}\
628
\
629
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
630
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
631
}\
632
\
633
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
634
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
635
}\
636
\
637
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
638
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
639
}\
640
\
641
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
642
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
643
}\
644
\
645
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
646
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
647
    int i;\
648
    for(i=0; i<h; i++){\
649
        uint32_t a, b, c, d, l0, l1, h0, h1;\
650
        a= LD32(&src1[i*src_stride1]);\
651
        b= LD32(&src2[i*src_stride2]);\
652
        c= LD32(&src3[i*src_stride3]);\
653
        d= LD32(&src4[i*src_stride4]);\
654
        l0=  (a&0x03030303UL)\
655
           + (b&0x03030303UL)\
656
           + 0x01010101UL;\
657
        h0= ((a&0xFCFCFCFCUL)>>2)\
658
          + ((b&0xFCFCFCFCUL)>>2);\
659
        l1=  (c&0x03030303UL)\
660
           + (d&0x03030303UL);\
661
        h1= ((c&0xFCFCFCFCUL)>>2)\
662
          + ((d&0xFCFCFCFCUL)>>2);\
663
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
664
        a= LD32(&src1[i*src_stride1+4]);\
665
        b= LD32(&src2[i*src_stride2+4]);\
666
        c= LD32(&src3[i*src_stride3+4]);\
667
        d= LD32(&src4[i*src_stride4+4]);\
668
        l0=  (a&0x03030303UL)\
669
           + (b&0x03030303UL)\
670
           + 0x01010101UL;\
671
        h0= ((a&0xFCFCFCFCUL)>>2)\
672
          + ((b&0xFCFCFCFCUL)>>2);\
673
        l1=  (c&0x03030303UL)\
674
           + (d&0x03030303UL);\
675
        h1= ((c&0xFCFCFCFCUL)>>2)\
676
          + ((d&0xFCFCFCFCUL)>>2);\
677
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
678
    }\
679
}\
680
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
682
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
683
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
684
}\
685
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
686
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
687
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
688
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
689
}\
690
\
691
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
692
{\
693
        int i, a0, b0, a1, b1;\
694
        a0= pixels[0];\
695
        b0= pixels[1] + 2;\
696
        a0 += b0;\
697
        b0 += pixels[2];\
698
\
699
        pixels+=line_size;\
700
        for(i=0; i<h; i+=2){\
701
            a1= pixels[0];\
702
            b1= pixels[1];\
703
            a1 += b1;\
704
            b1 += pixels[2];\
705
\
706
            block[0]= (a1+a0)>>2; /* FIXME non put */\
707
            block[1]= (b1+b0)>>2;\
708
\
709
            pixels+=line_size;\
710
            block +=line_size;\
711
\
712
            a0= pixels[0];\
713
            b0= pixels[1] + 2;\
714
            a0 += b0;\
715
            b0 += pixels[2];\
716
\
717
            block[0]= (a1+a0)>>2;\
718
            block[1]= (b1+b0)>>2;\
719
            pixels+=line_size;\
720
            block +=line_size;\
721
        }\
722
}\
723
\
724
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
725
{\
726
        int i;\
727
        const uint32_t a= LD32(pixels  );\
728
        const uint32_t b= LD32(pixels+1);\
729
        uint32_t l0=  (a&0x03030303UL)\
730
                    + (b&0x03030303UL)\
731
                    + 0x02020202UL;\
732
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
733
                   + ((b&0xFCFCFCFCUL)>>2);\
734
        uint32_t l1,h1;\
735
\
736
        pixels+=line_size;\
737
        for(i=0; i<h; i+=2){\
738
            uint32_t a= LD32(pixels  );\
739
            uint32_t b= LD32(pixels+1);\
740
            l1=  (a&0x03030303UL)\
741
               + (b&0x03030303UL);\
742
            h1= ((a&0xFCFCFCFCUL)>>2)\
743
              + ((b&0xFCFCFCFCUL)>>2);\
744
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
745
            pixels+=line_size;\
746
            block +=line_size;\
747
            a= LD32(pixels  );\
748
            b= LD32(pixels+1);\
749
            l0=  (a&0x03030303UL)\
750
               + (b&0x03030303UL)\
751
               + 0x02020202UL;\
752
            h0= ((a&0xFCFCFCFCUL)>>2)\
753
              + ((b&0xFCFCFCFCUL)>>2);\
754
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
755
            pixels+=line_size;\
756
            block +=line_size;\
757
        }\
758
}\
759
\
760
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
761
{\
762
    int j;\
763
    for(j=0; j<2; j++){\
764
        int i;\
765
        const uint32_t a= LD32(pixels  );\
766
        const uint32_t b= LD32(pixels+1);\
767
        uint32_t l0=  (a&0x03030303UL)\
768
                    + (b&0x03030303UL)\
769
                    + 0x02020202UL;\
770
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
771
                   + ((b&0xFCFCFCFCUL)>>2);\
772
        uint32_t l1,h1;\
773
\
774
        pixels+=line_size;\
775
        for(i=0; i<h; i+=2){\
776
            uint32_t a= LD32(pixels  );\
777
            uint32_t b= LD32(pixels+1);\
778
            l1=  (a&0x03030303UL)\
779
               + (b&0x03030303UL);\
780
            h1= ((a&0xFCFCFCFCUL)>>2)\
781
              + ((b&0xFCFCFCFCUL)>>2);\
782
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
783
            pixels+=line_size;\
784
            block +=line_size;\
785
            a= LD32(pixels  );\
786
            b= LD32(pixels+1);\
787
            l0=  (a&0x03030303UL)\
788
               + (b&0x03030303UL)\
789
               + 0x02020202UL;\
790
            h0= ((a&0xFCFCFCFCUL)>>2)\
791
              + ((b&0xFCFCFCFCUL)>>2);\
792
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
793
            pixels+=line_size;\
794
            block +=line_size;\
795
        }\
796
        pixels+=4-line_size*(h+1);\
797
        block +=4-line_size*h;\
798
    }\
799
}\
800
\
801
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
802
{\
803
    int j;\
804
    for(j=0; j<2; j++){\
805
        int i;\
806
        const uint32_t a= LD32(pixels  );\
807
        const uint32_t b= LD32(pixels+1);\
808
        uint32_t l0=  (a&0x03030303UL)\
809
                    + (b&0x03030303UL)\
810
                    + 0x01010101UL;\
811
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
812
                   + ((b&0xFCFCFCFCUL)>>2);\
813
        uint32_t l1,h1;\
814
\
815
        pixels+=line_size;\
816
        for(i=0; i<h; i+=2){\
817
            uint32_t a= LD32(pixels  );\
818
            uint32_t b= LD32(pixels+1);\
819
            l1=  (a&0x03030303UL)\
820
               + (b&0x03030303UL);\
821
            h1= ((a&0xFCFCFCFCUL)>>2)\
822
              + ((b&0xFCFCFCFCUL)>>2);\
823
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
824
            pixels+=line_size;\
825
            block +=line_size;\
826
            a= LD32(pixels  );\
827
            b= LD32(pixels+1);\
828
            l0=  (a&0x03030303UL)\
829
               + (b&0x03030303UL)\
830
               + 0x01010101UL;\
831
            h0= ((a&0xFCFCFCFCUL)>>2)\
832
              + ((b&0xFCFCFCFCUL)>>2);\
833
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
834
            pixels+=line_size;\
835
            block +=line_size;\
836
        }\
837
        pixels+=4-line_size*(h+1);\
838
        block +=4-line_size*h;\
839
    }\
840
}\
841
\
842
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
843
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
844
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
845
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
846
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
847
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
848
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
849
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* Rounding average used by the avg_* pixel primitives generated below. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the full family of put_* / avg_* pixel copy/average ops. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* 2- and 4-tap rounding averages used by the subpel helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

/**
 * 1/16-pel bilinear global motion compensation for an 8-pixel-wide block.
 * x16/y16 are the 1/16-pel fractional offsets; every output pixel is the
 * bilinear blend of its 2x2 source neighbourhood (weights A..D sum to 256),
 * with `rounder` added before the >>8 normalization.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, j;

    for(i=0; i<h; i++){
        /* same arithmetic as the hand-unrolled original, one row at a time */
        for(j=0; j<8; j++)
            dst[j]= (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

/**
 * Global motion compensation for an 8-pixel-wide block using a full affine
 * transform.  (ox,oy) is the transformed coordinate of the block's first
 * pixel; (dxx,dyx) is the per-column increment and (dxy,dyy) the per-row
 * increment of the running (vx,vy) pair.  Coordinates carry 16 fractional
 * bits; of the value after >>16, the low `shift` bits are the subpel
 * fraction (s = 1<<shift positions per pixel) and the rest is the integer
 * pixel position.  r is the rounding term applied before the >>(2*shift)
 * normalization.  Samples falling outside the width x height source area
 * are clamped to the nearest edge (via clip()), degrading the bilinear
 * blend to 1-D or pure copy as appropriate.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* convert to the largest valid coordinate, for the (unsigned) range checks */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the fixed-point coordinate into integer pixel position
               and `shift`-bit subpel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 4-tap bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, horizontal blend only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, vertical blend only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both outside: plain clamped copy */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

/**
 * Thirdpel full-pel position (put): dispatch to the plain block copy for
 * the given width; widths other than 2/4/8/16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}

/**
 * Thirdpel interpolation, horizontal 1/3 position (put).
 * dst[x] ~= (2*src[x] + src[x+1]) / 3 with rounding; 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683*(2*s[col] + s[col+1] + 1)) >> 11;
    }
}

/**
 * Thirdpel interpolation, horizontal 2/3 position (put).
 * dst[x] ~= (src[x] + 2*src[x+1]) / 3 with rounding; 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683*(s[col] + 2*s[col+1] + 1)) >> 11;
    }
}

/**
 * Thirdpel interpolation, vertical 1/3 position (put).
 * dst[x] ~= (2*src[x] + src[x+stride]) / 3 with rounding.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683*(2*s[col] + s[col+stride] + 1)) >> 11;
    }
}

/**
 * Thirdpel interpolation, (1/3, 1/3) position (put): blend of the 2x2
 * neighbourhood with weights 4/3/3/2 (sum 12); 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731*(4*s[col] + 3*s[col+1] + 3*s[col+stride] + 2*s[col+stride+1] + 6)) >> 15;
    }
}

/**
 * Thirdpel interpolation, (1/3, 2/3) position (put): 2x2 blend with
 * weights 3/2/4/3 (sum 12); 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731*(3*s[col] + 2*s[col+1] + 4*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
    }
}

/**
 * Thirdpel interpolation, vertical 2/3 position (put).
 * dst[x] ~= (src[x] + 2*src[x+stride]) / 3 with rounding.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683*(s[col] + 2*s[col+stride] + 1)) >> 11;
    }
}

/**
 * Thirdpel interpolation, (2/3, 1/3) position (put): 2x2 blend with
 * weights 3/4/2/3 (sum 12); 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731*(3*s[col] + 4*s[col+1] + 2*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
    }
}

/**
 * Thirdpel interpolation, (2/3, 2/3) position (put): 2x2 blend with
 * weights 2/3/3/4 (sum 12); 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731*(2*s[col] + 3*s[col+1] + 3*s[col+stride] + 4*s[col+stride+1] + 6)) >> 15;
    }
}

/**
 * Thirdpel full-pel position (avg): dispatch to the rounding-average
 * block op for the given width; other widths are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}

/**
 * Thirdpel interpolation, horizontal 1/3 position (avg): the interpolated
 * value is rounding-averaged with the existing destination pixel.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683*(2*s[col] + s[col+1] + 1)) >> 11) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, horizontal 2/3 position (avg): the interpolated
 * value is rounding-averaged with the existing destination pixel.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683*(s[col] + 2*s[col+1] + 1)) >> 11) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, vertical 1/3 position (avg): the interpolated
 * value is rounding-averaged with the existing destination pixel.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683*(2*s[col] + s[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, (1/3, 1/3) position (avg): 2x2 blend with
 * weights 4/3/3/2, then rounding-averaged with the destination pixel.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731*(4*s[col] + 3*s[col+1] + 3*s[col+stride] + 2*s[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, (1/3, 2/3) position (avg): 2x2 blend with
 * weights 3/2/4/3, then rounding-averaged with the destination pixel.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731*(3*s[col] + 2*s[col+1] + 4*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, vertical 2/3 position (avg): the interpolated
 * value is rounding-averaged with the existing destination pixel.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683*(s[col] + 2*s[col+stride] + 1)) >> 11) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, (2/3, 1/3) position (avg): 2x2 blend with
 * weights 3/4/2/3, then rounding-averaged with the destination pixel.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731*(3*s[col] + 4*s[col+1] + 2*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}

/**
 * Thirdpel interpolation, (2/3, 2/3) position (avg): 2x2 blend with
 * weights 2/3/3/4, then rounding-averaged with the destination pixel.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731*(2*s[col] + 3*s[col+1] + 3*s[col+stride] + 4*s[col+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}

/* Disabled width-specialized thirdpel wrappers.  Note the stray "void"
 * before each call — the bodies are malformed; kept verbatim inside
 * #if 0 for reference only. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c: H.264 chroma motion
 * compensation for 2/4/8-wide blocks.  x,y are the 1/8-pel fractional
 * offsets (0..7); A..D are the bilinear weights of the 2x2 neighbourhood
 * (they sum to 64) and OP performs the final (b+32)>>6 rounding — plus
 * averaging with the destination for the avg_ variant.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* (b+32)>>6 normalizes the 6-bit bilinear weight sum; avg additionally
 * rounding-averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

/* Copy h rows of a 4-byte-wide block, one 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}

/* Copy h rows of an 8-byte-wide block, two 32-bit load/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}

/* Copy h rows of a 16-byte-wide block, four 32-bit load/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}

/* Copy h rows of a 17-byte-wide block: four 32-bit load/stores plus one
 * trailing edge byte per row (17-wide source needed by the 16-pel
 * filters, which read one extra column). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}

/* Copy h rows of a 9-byte-wide block: two 32-bit load/stores plus one
 * trailing edge byte per row (9-wide source needed by the 8-pel filters,
 * which read one extra column). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}

#define QPEL_MC(r, OPNAME, RND, OP) \
1298
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1299
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1300
    int i;\
1301
    for(i=0; i<h; i++)\
1302
    {\
1303
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1304
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1305
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1306
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1307
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1308
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1309
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1310
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1311
        dst+=dstStride;\
1312
        src+=srcStride;\
1313
    }\
1314
}\
1315
\
1316
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1317
    const int w=8;\
1318
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1319
    int i;\
1320
    for(i=0; i<w; i++)\
1321
    {\
1322
        const int src0= src[0*srcStride];\
1323
        const int src1= src[1*srcStride];\
1324
        const int src2= src[2*srcStride];\
1325
        const int src3= src[3*srcStride];\
1326
        const int src4= src[4*srcStride];\
1327
        const int src5= src[5*srcStride];\
1328
        const int src6= src[6*srcStride];\
1329
        const int src7= src[7*srcStride];\
1330
        const int src8= src[8*srcStride];\
1331
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1332
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1333
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1334
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1335
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1336
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1337
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1338
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1339
        dst++;\
1340
        src++;\
1341
    }\
1342
}\
1343
\
1344
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1345
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1346
    int i;\
1347
    \
1348
    for(i=0; i<h; i++)\
1349
    {\
1350
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1351
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1352
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1353
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1354
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1355
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1356
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1357
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1358
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1359
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1360
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1361
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1362
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1363
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1364
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1365
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1366
        dst+=dstStride;\
1367
        src+=srcStride;\
1368
    }\
1369
}\
1370
\
1371
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1372
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1373
    int i;\
1374
    const int w=16;\
1375
    for(i=0; i<w; i++)\
1376
    {\
1377
        const int src0= src[0*srcStride];\
1378
        const int src1= src[1*srcStride];\
1379
        const int src2= src[2*srcStride];\
1380
        const int src3= src[3*srcStride];\
1381
        const int src4= src[4*srcStride];\
1382
        const int src5= src[5*srcStride];\
1383
        const int src6= src[6*srcStride];\
1384
        const int src7= src[7*srcStride];\
1385
        const int src8= src[8*srcStride];\
1386
        const int src9= src[9*srcStride];\
1387
        const int src10= src[10*srcStride];\
1388
        const int src11= src[11*srcStride];\
1389
        const int src12= src[12*srcStride];\
1390
        const int src13= src[13*srcStride];\
1391
        const int src14= src[14*srcStride];\
1392
        const int src15= src[15*srcStride];\
1393
        const int src16= src[16*srcStride];\
1394
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1395
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1396
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1397
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1398
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1399
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1400
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1401
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1402
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1403
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1404
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1405
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1406
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1407
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1408
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1409
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1410
        dst++;\
1411
        src++;\
1412
    }\
1413
}\
1414
\
1415
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1416
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1417
}\
1418
\
1419
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1420
    uint8_t half[64];\
1421
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1422
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1423
}\
1424
\
1425
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1426
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1427
}\
1428
\
1429
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1430
    uint8_t half[64];\
1431
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1432
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1433
}\
1434
\
1435
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1436
    uint8_t full[16*9];\
1437
    uint8_t half[64];\
1438
    copy_block9(full, src, 16, stride, 9);\
1439
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1440
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1441
}\
1442
\
1443
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1444
    uint8_t full[16*9];\
1445
    copy_block9(full, src, 16, stride, 9);\
1446
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1447
}\
1448
\
1449
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1450
    uint8_t full[16*9];\
1451
    uint8_t half[64];\
1452
    copy_block9(full, src, 16, stride, 9);\
1453
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1455
}\
1456
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458
    uint8_t halfH[72];\
1459
    uint8_t halfV[64];\
1460
    uint8_t halfHV[64];\
1461
    copy_block9(full, src, 16, stride, 9);\
1462
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1463
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1464
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1465
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1466
}\
1467
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1468
    uint8_t full[16*9];\
1469
    uint8_t halfH[72];\
1470
    uint8_t halfHV[64];\
1471
    copy_block9(full, src, 16, stride, 9);\
1472
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1473
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1474
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1475
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1476
}\
1477
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478
    uint8_t full[16*9];\
1479
    uint8_t halfH[72];\
1480
    uint8_t halfV[64];\
1481
    uint8_t halfHV[64];\
1482
    copy_block9(full, src, 16, stride, 9);\
1483
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1485
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1487
}\
1488
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1489
    uint8_t full[16*9];\
1490
    uint8_t halfH[72];\
1491
    uint8_t halfHV[64];\
1492
    copy_block9(full, src, 16, stride, 9);\
1493
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1495
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1497
}\
1498
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499
    uint8_t full[16*9];\
1500
    uint8_t halfH[72];\
1501
    uint8_t halfV[64];\
1502
    uint8_t halfHV[64];\
1503
    copy_block9(full, src, 16, stride, 9);\
1504
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508
}\
1509
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1510
    uint8_t full[16*9];\
1511
    uint8_t halfH[72];\
1512
    uint8_t halfHV[64];\
1513
    copy_block9(full, src, 16, stride, 9);\
1514
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1518
}\
1519
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520
    uint8_t full[16*9];\
1521
    uint8_t halfH[72];\
1522
    uint8_t halfV[64];\
1523
    uint8_t halfHV[64];\
1524
    copy_block9(full, src, 16, stride, 9);\
1525
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1526
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529
}\
1530
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1531
    uint8_t full[16*9];\
1532
    uint8_t halfH[72];\
1533
    uint8_t halfHV[64];\
1534
    copy_block9(full, src, 16, stride, 9);\
1535
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1539
}\
1540
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1541
    uint8_t halfH[72];\
1542
    uint8_t halfHV[64];\
1543
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1544
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1545
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1546
}\
1547
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1548
    uint8_t halfH[72];\
1549
    uint8_t halfHV[64];\
1550
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1551
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1553
}\
1554
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1555
    uint8_t full[16*9];\
1556
    uint8_t halfH[72];\
1557
    uint8_t halfV[64];\
1558
    uint8_t halfHV[64];\
1559
    copy_block9(full, src, 16, stride, 9);\
1560
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1561
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1562
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1563
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1564
}\
1565
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1566
    uint8_t full[16*9];\
1567
    uint8_t halfH[72];\
1568
    copy_block9(full, src, 16, stride, 9);\
1569
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1570
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1571
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1572
}\
1573
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1574
    uint8_t full[16*9];\
1575
    uint8_t halfH[72];\
1576
    uint8_t halfV[64];\
1577
    uint8_t halfHV[64];\
1578
    copy_block9(full, src, 16, stride, 9);\
1579
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1580
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1581
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1582
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1583
}\
1584
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1585
    uint8_t full[16*9];\
1586
    uint8_t halfH[72];\
1587
    copy_block9(full, src, 16, stride, 9);\
1588
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1589
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1590
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1591
}\
1592
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1593
    uint8_t halfH[72];\
1594
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1595
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1596
}\
1597
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1598
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1599
}\
1600
\
1601
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1602
    uint8_t half[256];\
1603
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1604
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1605
}\
1606
\
1607
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1608
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1609
}\
1610
\
1611
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1612
    uint8_t half[256];\
1613
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1614
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1615
}\
1616
\
1617
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1618
    uint8_t full[24*17];\
1619
    uint8_t half[256];\
1620
    copy_block17(full, src, 24, stride, 17);\
1621
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1622
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1623
}\
1624
\
1625
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1626
    uint8_t full[24*17];\
1627
    copy_block17(full, src, 24, stride, 17);\
1628
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1629
}\
1630
\
1631
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1632
    uint8_t full[24*17];\
1633
    uint8_t half[256];\
1634
    copy_block17(full, src, 24, stride, 17);\
1635
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1637
}\
1638
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640
    uint8_t halfH[272];\
1641
    uint8_t halfV[256];\
1642
    uint8_t halfHV[256];\
1643
    copy_block17(full, src, 24, stride, 17);\
1644
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1645
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1646
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1647
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1648
}\
1649
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1650
    uint8_t full[24*17];\
1651
    uint8_t halfH[272];\
1652
    uint8_t halfHV[256];\
1653
    copy_block17(full, src, 24, stride, 17);\
1654
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1655
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1656
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1657
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1658
}\
1659
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660
    uint8_t full[24*17];\
1661
    uint8_t halfH[272];\
1662
    uint8_t halfV[256];\
1663
    uint8_t halfHV[256];\
1664
    copy_block17(full, src, 24, stride, 17);\
1665
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1667
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1669
}\
1670
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1671
    uint8_t full[24*17];\
1672
    uint8_t halfH[272];\
1673
    uint8_t halfHV[256];\
1674
    copy_block17(full, src, 24, stride, 17);\
1675
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1677
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1679
}\
1680
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681
    uint8_t full[24*17];\
1682
    uint8_t halfH[272];\
1683
    uint8_t halfV[256];\
1684
    uint8_t halfHV[256];\
1685
    copy_block17(full, src, 24, stride, 17);\
1686
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690
}\
1691
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1692
    uint8_t full[24*17];\
1693
    uint8_t halfH[272];\
1694
    uint8_t halfHV[256];\
1695
    copy_block17(full, src, 24, stride, 17);\
1696
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1700
}\
1701
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702
    uint8_t full[24*17];\
1703
    uint8_t halfH[272];\
1704
    uint8_t halfV[256];\
1705
    uint8_t halfHV[256];\
1706
    copy_block17(full, src, 24, stride, 17);\
1707
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1708
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711
}\
1712
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1713
    uint8_t full[24*17];\
1714
    uint8_t halfH[272];\
1715
    uint8_t halfHV[256];\
1716
    copy_block17(full, src, 24, stride, 17);\
1717
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1721
}\
1722
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1723
    uint8_t halfH[272];\
1724
    uint8_t halfHV[256];\
1725
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1726
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1727
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1728
}\
1729
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1730
    uint8_t halfH[272];\
1731
    uint8_t halfHV[256];\
1732
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1733
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1735
}\
1736
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737
    uint8_t full[24*17];\
1738
    uint8_t halfH[272];\
1739
    uint8_t halfV[256];\
1740
    uint8_t halfHV[256];\
1741
    copy_block17(full, src, 24, stride, 17);\
1742
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1743
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1744
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1745
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1746
}\
1747
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[24*17];\
1749
    uint8_t halfH[272];\
1750
    copy_block17(full, src, 24, stride, 17);\
1751
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1752
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1753
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1754
}\
1755
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1756
    uint8_t full[24*17];\
1757
    uint8_t halfH[272];\
1758
    uint8_t halfV[256];\
1759
    uint8_t halfHV[256];\
1760
    copy_block17(full, src, 24, stride, 17);\
1761
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1762
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1763
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1764
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1765
}\
1766
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1767
    uint8_t full[24*17];\
1768
    uint8_t halfH[272];\
1769
    copy_block17(full, src, 24, stride, 17);\
1770
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1771
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1772
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1773
}\
1774
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1775
    uint8_t halfH[272];\
1776
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1777
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1778
}
/* Store ops used to instantiate QPEL_MC below.  The filter sums they
 * receive carry a scale factor of 32, hence the >>5 with a +16 bias
 * (rounding) or +15 bias (no-rounding).  'cm' is expected to be the
 * cropTbl-based clamp table in scope inside the generated functions
 * (NOTE(review): assumed to clamp to 0..255 — confirm against cropTbl init). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the MPEG-4 quarter-pel MC function families from QPEL_MC
 * (defined above): rounding put/avg and the no-rounding put variant.
 * The op_* store macros are single-use helpers, so undef them afterwards. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Expands to the C reference six-tap (1,-5,20,20,-5,1) half-sample filters
 * used by the H.264 quarter-pel motion compensation (see H264_MC below):
 *   - <OPNAME>h264_qpel{4,8}_h_lowpass: horizontal filter, sum scaled by 32
 *   - <OPNAME>h264_qpel{4,8}_v_lowpass: vertical filter, same taps
 *   - <OPNAME>h264_qpel{4,8}_hv_lowpass: horizontal pass written to the
 *     int16_t 'tmp' scratch buffer, then a vertical pass over it, so the
 *     final sum is scaled by 32*32 = 1024
 *   - the qpel16_* variants tile four 8x8 calls.
 * OP performs the final scale+clip (and optional averaging) for a value
 * scaled by 32; OP2 for a value scaled by 1024 (see op_put/op2_put below).
 * 'cm' points into cropTbl and clips the filter output to the uint8_t
 * range (NOTE(review): assumes cropTbl is the usual 0..255 clamp table).
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}

/*
 * H264_MC(OPNAME, SIZE)
 *
 * Expands to the quarter-pel motion-compensation functions
 * <OPNAME>h264_qpel<SIZE>_mcXY_c for one block size, where X,Y are the
 * horizontal/vertical quarter-pel offsets.  Strategy per position:
 *   - mc00: plain pixel copy/average.
 *   - mc10/mc30: average of source (or source+1) with the horizontal
 *     half-pel filter output.
 *   - mc20 / mc02 / mc22: pure H, V and combined HV half-pel filters.
 *   - mc01/mc03: average of source with the vertical filter output; the
 *     source is first copied into 'full' starting at src - 2*stride with
 *     SIZE+5 rows so the six-tap filter has its guard rows.
 *   - mc11/mc31/mc13/mc33: average of the H and V half-pel planes.
 *   - mc21/mc23: average of the H and HV planes.
 *   - mc12/mc32: average of the V and HV planes.
 * copy_block<SIZE> and the <OPNAME>pixels<SIZE>_c / _l2 helpers are
 * defined elsewhere in this file.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}

/* Store macros plugged into the H.264 low-pass templates:
 *  op_put/op_avg   round+clip a 6-bit intermediate  (((b)+16)>>5)
 *  op2_put/op2_avg round+clip a 10-bit intermediate (((b)+512)>>10),
 *                  used after the two-pass (H then V) filter.
 * The *_avg variants additionally average with the existing destination
 * pixel, rounding up ((a+b+1)>>1).  cm is the clip table in scope in the
 * template bodies. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the put_ and avg_ quarter-pel MC function families for
 * 4x4, 8x8 and 16x16 blocks from the templates above. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2155

    
2156
/* WMV2 horizontal (9,-1) half-pel filter, 8 pixels wide:
 *   dst[x] = clip((9*(src[x]+src[x+1]) - (src[x-1]+src[x+2]) + 8) >> 4)
 * Pixels are written left to right, matching the original unrolled order. */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int row, x;

    for(row=0; row<h; row++){
        for(x=0; x<8; x++)
            dst[x]= cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8)>>4];
        dst += dstStride;
        src += srcStride;
    }
}
2173

    
2174
/* WMV2 vertical (9,-1) half-pel filter, processed one column at a time.
 * All ten source taps (rows -1..8 relative to the output) are loaded
 * before any write, exactly as in the fully unrolled original, so the
 * result is unchanged even if dst overlaps src. */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int col, k;

    for(col=0; col<w; col++){
        int v[10];  /* v[k] = src[(k-1)*srcStride], i.e. rows -1..8 */
        for(k=0; k<10; k++)
            v[k]= src[(k-1)*srcStride];
        for(k=0; k<8; k++)
            dst[k*dstStride]= cm[(9*(v[k+1] + v[k+2]) - (v[k] + v[k+3]) + 8)>>4];
        src++;
        dst++;
    }
}
2202

    
2203
/* WMV2 mspel MC, integer-pel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2206

    
2207
/* WMV2 mspel MC (1,0): average of the source block and the horizontal
 * half-pel filter output (quarter-pel interpolation). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2212

    
2213
/* WMV2 mspel MC (2,0): horizontal half-pel, filtered straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2216

    
2217
/* WMV2 mspel MC (3,0): average of src shifted right by one pixel and the
 * horizontal half-pel filter output. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2222

    
2223
/* WMV2 mspel MC (0,2): vertical half-pel, filtered straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2226

    
2227
/* WMV2 mspel MC (1,2): average of the vertical half-pel result and the
 * separable H-then-V result.  halfH holds 11 filtered rows (8x11,
 * starting one row above the block) so the vertical pass has the extra
 * context it needs; halfH+8 skips that leading row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2236
/* WMV2 mspel MC (3,2): like mc12 but the pure vertical pass starts one
 * pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2245
/* WMV2 mspel MC (2,2): horizontal filter over 11 rows, then vertical
 * filter of the intermediate (skipping the leading row) into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2250

    
2251

    
2252
/* Sum of absolute differences (SAD) over a 16x16 block. */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2279

    
2280
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2307

    
2308
/* SAD of pix1 against the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one directly below it). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2337

    
2338
/* SAD of pix1 against the half-pel-in-both-axes interpolation of pix2
 * (avg4 of the 2x2 neighbourhood). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2367

    
2368
/* Sum of absolute differences (SAD) over an 8x8 block. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2387

    
2388
/* 8x8 SAD against the horizontal half-pel interpolation of pix2. */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2407

    
2408
/* 8x8 SAD against the vertical half-pel interpolation of pix2. */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2429

    
2430
/* 8x8 SAD against the half-pel-in-both-axes (2x2 avg4) interpolation. */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2451

    
2452
/* 16x16 SAD with the comparison-function signature (the context pointer
 * s is unused). */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2455

    
2456
/* 8x8 SAD with the comparison-function signature (the context pointer
 * s is unused). */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2459

    
2460
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];
    
    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms

    /* lift the coefficients up to `last` out of the block, zeroing them
       in place so positions not rewritten below end up 0 */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }
    
    /* scatter them back at their permuted positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
2488

    
2489
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* 6 blocks of 64 coefficients each */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2496

    
2497
/* Byte-wise accumulate: dst[i] += src[i] for 0 <= i < w, with the usual
 * uint8_t wrap-around.  Behaviour is identical to the original
 * eight-way unrolled loop. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i<w; i++)
        dst[i] += src[i];
}
2512

    
2513
/* Byte-wise difference: dst[i] = src1[i] - src2[i] (modulo 256).
 * dst may alias src1 since each element is read before it is written. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
2528

    
2529
/* HuffYUV median-prediction residue: for each i the predictor is
 * mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF), where l is the
 * previous src2 byte and lt the previous src1 byte; dst gets
 * src2[i] - pred.  *left / *left_top carry the running l / lt state
 * across calls. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }    

    *left= l;
    *left_top= lt;
}
2546

    
2547
/* 2-point butterfly: o1/o2 receive sum and difference of i1/i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly on x and y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2561

    
2562
/* SATD comparison: 8x8 Hadamard transform of the difference src - dst,
 * summed as absolute values. */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* horizontal 8-point Hadamard pass on each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on each column; the last butterfly stage is folded
       into the absolute sum via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2611

    
2612
/* Like hadamard8_diff_c, but transforms src - mean instead of a block
 * difference: SATD of an 8x8 block around a constant mean value. */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal 8-point Hadamard pass on each row of (src - mean) */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass per column, final stage folded into the sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    return sum;
}
2655

    
2656
/* Comparison score: sum of absolute DCT coefficients of the 8x8
 * difference src1 - src2. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];  /* 8-byte aligned DCT buffer */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2670

    
2671
void simple_idct(DCTELEM *block); //FIXME
2672

    
2673
/* Comparison score: squared error introduced by the
 * quantize -> dequantize -> IDCT round trip of the 8x8 difference
 * src1 - src2 (bak keeps the pre-quantization coefficients). */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];  /* two 64-coeff blocks */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2695

    
2696
/* Rate-distortion comparison score for an 8x8 block: quantizes the
 * difference src1 - src2, counts the VLC bits of the result, then
 * reconstructs (dequantize + IDCT onto a copy of src2) and measures the
 * SSE against src1.  Returns distortion + lambda-weighted bit cost. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];  /* 8 rows of 8 pixels, aligned */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    /* snapshot the 8x8 prediction (two 32-bit copies per row) so we can
       reconstruct onto it below without touching src2 */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the intra/inter AC VLC tables; intra also pays for the DC coeff */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* walk the scan order counting (run, level) code lengths; levels
       outside [-64,63] cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64);
        
        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2769

    
2770
/* Comparison score: estimated VLC bit cost of coding the quantized 8x8
 * difference src1 - src2 (same counting as rd8x8_c, distortion omitted). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the intra/inter AC VLC tables; intra also pays for the DC coeff */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* sum (run, level) code lengths in scan order; out-of-range levels
       pay the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64);
        
        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2827

    
2828

    
2829
/* Build 16x16 comparison functions from the 8x8 ones.  NOTE(review):
 * WARPER88_1616 is defined elsewhere; presumably it applies the 8x8
 * function to the four 8x8 quadrants and sums -- confirm at its
 * definition. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2834

    
2835
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Reference integer IDCT (j_rev_dct) followed by a clamped store. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
2842
/* Reference integer IDCT (j_rev_dct) followed by a clamped add to dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2847

    
2848
/* init static data */
void dsputil_static_init(void)
{
    int i;

    /* cropTbl: clip-to-[0,255] lookup with MAX_NEG_CROP guard entries
       of 0 below and 255 above the identity range */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }
    
    /* squareTbl[i] = (i-256)^2, for squared-error sums */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }
    
    /* inverse zigzag scan position + 1, for the MMX quantizer */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
2865

    
2866

    
2867
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2868
{
2869
    int i;
2870

    
2871
#ifdef CONFIG_ENCODERS
2872
    if(avctx->dct_algo==FF_DCT_FASTINT)
2873
        c->fdct = fdct_ifast;
2874
    else
2875
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2876
#endif //CONFIG_ENCODERS
2877

    
2878
    if(avctx->idct_algo==FF_IDCT_INT){
2879
        c->idct_put= ff_jref_idct_put;
2880
        c->idct_add= ff_jref_idct_add;
2881
        c->idct    = j_rev_dct;
2882
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2883
    }else{ //accurate/default
2884
        c->idct_put= simple_idct_put;
2885
        c->idct_add= simple_idct_add;
2886
        c->idct    = simple_idct;
2887
        c->idct_permutation_type= FF_NO_IDCT_PERM;
2888
    }
2889

    
2890
    c->get_pixels = get_pixels_c;
2891
    c->diff_pixels = diff_pixels_c;
2892
    c->put_pixels_clamped = put_pixels_clamped_c;
2893
    c->add_pixels_clamped = add_pixels_clamped_c;
2894
    c->gmc1 = gmc1_c;
2895
    c->gmc = gmc_c;
2896
    c->clear_blocks = clear_blocks_c;
2897
    c->pix_sum = pix_sum_c;
2898
    c->pix_norm1 = pix_norm1_c;
2899
    c->sse[0]= sse16_c;
2900
    c->sse[1]= sse8_c;
2901

    
2902
    /* TODO [0] 16  [1] 8 */
2903
    c->pix_abs16x16     = pix_abs16x16_c;
2904
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
2905
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
2906
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2907
    c->pix_abs8x8     = pix_abs8x8_c;
2908
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
2909
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
2910
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2911

    
2912
#define dspfunc(PFX, IDX, NUM) \
2913
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
2914
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
2915
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
2916
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2917

    
2918
    dspfunc(put, 0, 16);
2919
    dspfunc(put_no_rnd, 0, 16);
2920
    dspfunc(put, 1, 8);
2921
    dspfunc(put_no_rnd, 1, 8);
2922
    dspfunc(put, 2, 4);
2923
    dspfunc(put, 3, 2);
2924

    
2925
    dspfunc(avg, 0, 16);
2926
    dspfunc(avg_no_rnd, 0, 16);
2927
    dspfunc(avg, 1, 8);
2928
    dspfunc(avg_no_rnd, 1, 8);
2929
    dspfunc(avg, 2, 4);
2930
    dspfunc(avg, 3, 2);
2931
#undef dspfunc
2932

    
2933
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2934
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2935
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2936
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2937
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2938
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2939
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2940
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2941
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2942

    
2943
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2944
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2945
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2946
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2947
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2948
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2949
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2950
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2951
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2952

    
2953
#define dspfunc(PFX, IDX, NUM) \
2954
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2955
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2956
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2957
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2958
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2959
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2960
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2961
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2962
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2963
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2964
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2965
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2966
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2967
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2968
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2969
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2970

    
2971
    dspfunc(put_qpel, 0, 16);
2972
    dspfunc(put_no_rnd_qpel, 0, 16);
2973

    
2974
    dspfunc(avg_qpel, 0, 16);
2975
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2976

    
2977
    dspfunc(put_qpel, 1, 8);
2978
    dspfunc(put_no_rnd_qpel, 1, 8);
2979

    
2980
    dspfunc(avg_qpel, 1, 8);
2981
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2982

    
2983
    dspfunc(put_h264_qpel, 0, 16);
2984
    dspfunc(put_h264_qpel, 1, 8);
2985
    dspfunc(put_h264_qpel, 2, 4);
2986
    dspfunc(avg_h264_qpel, 0, 16);
2987
    dspfunc(avg_h264_qpel, 1, 8);
2988
    dspfunc(avg_h264_qpel, 2, 4);
2989

    
2990
#undef dspfunc
2991
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
2992
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
2993
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
2994
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
2995
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
2996
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
2997

    
2998
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2999
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3000
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3001
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3002
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3003
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3004
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3005
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3006
        
3007
    c->hadamard8_diff[0]= hadamard8_diff16_c;
3008
    c->hadamard8_diff[1]= hadamard8_diff_c;
3009
    c->hadamard8_abs = hadamard8_abs_c;
3010
    
3011
    c->dct_sad[0]= dct_sad16x16_c;
3012
    c->dct_sad[1]= dct_sad8x8_c;
3013
    
3014
    c->sad[0]= sad16x16_c;
3015
    c->sad[1]= sad8x8_c;
3016
    
3017
    c->quant_psnr[0]= quant_psnr16x16_c;
3018
    c->quant_psnr[1]= quant_psnr8x8_c;
3019

    
3020
    c->rd[0]= rd16x16_c;
3021
    c->rd[1]= rd8x8_c;
3022

    
3023
    c->bit[0]= bit16x16_c;
3024
    c->bit[1]= bit8x8_c;
3025
        
3026
    c->add_bytes= add_bytes_c;
3027
    c->diff_bytes= diff_bytes_c;
3028
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3029
    c->bswap_buf= bswap_buf;
3030

    
3031
#ifdef HAVE_MMX
3032
    dsputil_init_mmx(c, avctx);
3033
#endif
3034
#ifdef ARCH_ARMV4L
3035
    dsputil_init_armv4l(c, avctx);
3036
#endif
3037
#ifdef HAVE_MLIB
3038
    dsputil_init_mlib(c, avctx);
3039
#endif
3040
#ifdef ARCH_ALPHA
3041
    dsputil_init_alpha(c, avctx);
3042
#endif
3043
#ifdef ARCH_POWERPC
3044
    dsputil_init_ppc(c, avctx);
3045
#endif
3046
#ifdef HAVE_MMI
3047
    dsputil_init_mmi(c, avctx);
3048
#endif
3049
#ifdef ARCH_SH4
3050
    dsputil_init_sh4(c,avctx);
3051
#endif
3052

    
3053
    switch(c->idct_permutation_type){
3054
    case FF_NO_IDCT_PERM:
3055
        for(i=0; i<64; i++)
3056
            c->idct_permutation[i]= i;
3057
        break;
3058
    case FF_LIBMPEG2_IDCT_PERM:
3059
        for(i=0; i<64; i++)
3060
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3061
        break;
3062
    case FF_SIMPLE_IDCT_PERM:
3063
        for(i=0; i<64; i++)
3064
            c->idct_permutation[i]= simple_mmx_permutation[i];
3065
        break;
3066
    case FF_TRANSPOSE_IDCT_PERM:
3067
        for(i=0; i<64; i++)
3068
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3069
        break;
3070
    default:
3071
        fprintf(stderr, "Internal error, IDCT permutation not set\n");
3072
    }
3073
}
3074