Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 68ca24e6

History | View | Annotate | Download (115 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31
#include "faandct.h"
32

    
33
/* Clamp lookup table: cropTbl[MAX_NEG_CROP + x] is x clamped to [0,255].
   NOTE(review): presumably filled at dsputil init time — the init code is
   not visible in this chunk; confirm before relying on its contents. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table indexed with a +256 bias (see sq = squareTbl + 256
   in the pix_norm1/sse helpers below).
   NOTE(review): presumably squareTbl[256 + x] == x*x — filled elsewhere; confirm. */
uint32_t squareTbl[512];
35

    
36
/* Classic zig-zag scan: entry i is the raster-order position of the
   i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46

    
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
59

    
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): defined uninitialized here — presumably filled at init time
   elsewhere in the file; confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
62

    
63
/* Alternate horizontal scan order (entry i = raster position of the
   i-th scanned coefficient). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
73

    
74
/* Alternate vertical scan order (entry i = raster position of the
   i-th scanned coefficient). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
84

    
85
/* Reciprocal table for division by small constants:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
   (inverse[0] is unused; inverse[1] saturates to 0xFFFFFFFF.) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
120

    
121
/* Input permutation for the simple_idct_mmx */
/* Values are 0xRC bytes: high nibble = row, low nibble = column of the
   source coefficient that lands at this position. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
132

    
133
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between consecutive rows
 * @return the sum of the 256 pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;          /* advance to the next row */
    }
    return total;
}
154

    
155
static int pix_norm1_c(uint8_t * pix, int line_size)
156
{
157
    int s, i, j;
158
    uint32_t *sq = squareTbl + 256;
159

    
160
    s = 0;
161
    for (i = 0; i < 16; i++) {
162
        for (j = 0; j < 16; j += 8) {
163
#if 0
164
            s += sq[pix[0]];
165
            s += sq[pix[1]];
166
            s += sq[pix[2]];
167
            s += sq[pix[3]];
168
            s += sq[pix[4]];
169
            s += sq[pix[5]];
170
            s += sq[pix[6]];
171
            s += sq[pix[7]];
172
#else
173
#if LONG_MAX > 2147483647
174
            register uint64_t x=*(uint64_t*)pix;
175
            s += sq[x&0xff];
176
            s += sq[(x>>8)&0xff];
177
            s += sq[(x>>16)&0xff];
178
            s += sq[(x>>24)&0xff];
179
            s += sq[(x>>32)&0xff];
180
            s += sq[(x>>40)&0xff];
181
            s += sq[(x>>48)&0xff];
182
            s += sq[(x>>56)&0xff];
183
#else
184
            register uint32_t x=*(uint32_t*)pix;
185
            s += sq[x&0xff];
186
            s += sq[(x>>8)&0xff];
187
            s += sq[(x>>16)&0xff];
188
            s += sq[(x>>24)&0xff];
189
            x=*(uint32_t*)(pix+4);
190
            s += sq[x&0xff];
191
            s += sq[(x>>8)&0xff];
192
            s += sq[(x>>16)&0xff];
193
            s += sq[(x>>24)&0xff];
194
#endif
195
#endif
196
            pix += 8;
197
        }
198
        pix += line_size - 16;
199
    }
200
    return s;
201
}
202

    
203
/**
 * Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Processes eight words per iteration, then handles the tail.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* main unrolled loop: eight words at a time */
    while (i + 8 <= w) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* remaining 0..7 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
220

    
221
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
222
{
223
    int s, i;
224
    uint32_t *sq = squareTbl + 256;
225

    
226
    s = 0;
227
    for (i = 0; i < 8; i++) {
228
        s += sq[pix1[0] - pix2[0]];
229
        s += sq[pix1[1] - pix2[1]];
230
        s += sq[pix1[2] - pix2[2]];
231
        s += sq[pix1[3] - pix2[3]];
232
        s += sq[pix1[4] - pix2[4]];
233
        s += sq[pix1[5] - pix2[5]];
234
        s += sq[pix1[6] - pix2[6]];
235
        s += sq[pix1[7] - pix2[7]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
243
{
244
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < 16; i++) {
249
        s += sq[pix1[ 0] - pix2[ 0]];
250
        s += sq[pix1[ 1] - pix2[ 1]];
251
        s += sq[pix1[ 2] - pix2[ 2]];
252
        s += sq[pix1[ 3] - pix2[ 3]];
253
        s += sq[pix1[ 4] - pix2[ 4]];
254
        s += sq[pix1[ 5] - pix2[ 5]];
255
        s += sq[pix1[ 6] - pix2[ 6]];
256
        s += sq[pix1[ 7] - pix2[ 7]];
257
        s += sq[pix1[ 8] - pix2[ 8]];
258
        s += sq[pix1[ 9] - pix2[ 9]];
259
        s += sq[pix1[10] - pix2[10]];
260
        s += sq[pix1[11] - pix2[11]];
261
        s += sq[pix1[12] - pix2[12]];
262
        s += sq[pix1[13] - pix2[13]];
263
        s += sq[pix1[14] - pix2[14]];
264
        s += sq[pix1[15] - pix2[15]];
265

    
266
        pix1 += line_size;
267
        pix2 += line_size;
268
    }
269
    return s;
270
}
271

    
272
/**
 * Copy an 8x8 block of pixels into a DCT coefficient block,
 * widening each byte to a DCTELEM.
 * @param block     destination, 64 contiguous DCTELEMs (row-major)
 * @param pixels    source pixels
 * @param line_size byte stride of the source
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;               /* destination rows are packed */
    }
}
290

    
291
/**
 * Store the per-pixel difference of two 8x8 blocks (s1 - s2) into a
 * DCT coefficient block.
 * @param block  destination, 64 contiguous DCTELEMs (row-major)
 * @param s1     minuend block
 * @param s2     subtrahend block
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;                /* destination rows are packed */
    }
}
310

    
311

    
312
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
313
                                 int line_size)
314
{
315
    int i;
316
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
317
    
318
    /* read the pixels */
319
    for(i=0;i<8;i++) {
320
        pixels[0] = cm[block[0]];
321
        pixels[1] = cm[block[1]];
322
        pixels[2] = cm[block[2]];
323
        pixels[3] = cm[block[3]];
324
        pixels[4] = cm[block[4]];
325
        pixels[5] = cm[block[5]];
326
        pixels[6] = cm[block[6]];
327
        pixels[7] = cm[block[7]];
328

    
329
        pixels += line_size;
330
        block += 8;
331
    }
332
}
333

    
334
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
335
                          int line_size)
336
{
337
    int i;
338
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
339
    
340
    /* read the pixels */
341
    for(i=0;i<8;i++) {
342
        pixels[0] = cm[pixels[0] + block[0]];
343
        pixels[1] = cm[pixels[1] + block[1]];
344
        pixels[2] = cm[pixels[2] + block[2]];
345
        pixels[3] = cm[pixels[3] + block[3]];
346
        pixels[4] = cm[pixels[4] + block[4]];
347
        pixels[5] = cm[pixels[5] + block[5]];
348
        pixels[6] = cm[pixels[6] + block[6]];
349
        pixels[7] = cm[pixels[7] + block[7]];
350
        pixels += line_size;
351
        block += 8;
352
    }
353
}
354
#if 0
355

356
#define PIXOP2(OPNAME, OP) \
357
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358
{\
359
    int i;\
360
    for(i=0; i<h; i++){\
361
        OP(*((uint64_t*)block), LD64(pixels));\
362
        pixels+=line_size;\
363
        block +=line_size;\
364
    }\
365
}\
366
\
367
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
368
{\
369
    int i;\
370
    for(i=0; i<h; i++){\
371
        const uint64_t a= LD64(pixels  );\
372
        const uint64_t b= LD64(pixels+1);\
373
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
374
        pixels+=line_size;\
375
        block +=line_size;\
376
    }\
377
}\
378
\
379
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
380
{\
381
    int i;\
382
    for(i=0; i<h; i++){\
383
        const uint64_t a= LD64(pixels  );\
384
        const uint64_t b= LD64(pixels+1);\
385
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
386
        pixels+=line_size;\
387
        block +=line_size;\
388
    }\
389
}\
390
\
391
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
392
{\
393
    int i;\
394
    for(i=0; i<h; i++){\
395
        const uint64_t a= LD64(pixels          );\
396
        const uint64_t b= LD64(pixels+line_size);\
397
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
398
        pixels+=line_size;\
399
        block +=line_size;\
400
    }\
401
}\
402
\
403
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
404
{\
405
    int i;\
406
    for(i=0; i<h; i++){\
407
        const uint64_t a= LD64(pixels          );\
408
        const uint64_t b= LD64(pixels+line_size);\
409
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
410
        pixels+=line_size;\
411
        block +=line_size;\
412
    }\
413
}\
414
\
415
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
416
{\
417
        int i;\
418
        const uint64_t a= LD64(pixels  );\
419
        const uint64_t b= LD64(pixels+1);\
420
        uint64_t l0=  (a&0x0303030303030303ULL)\
421
                    + (b&0x0303030303030303ULL)\
422
                    + 0x0202020202020202ULL;\
423
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
424
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
425
        uint64_t l1,h1;\
426
\
427
        pixels+=line_size;\
428
        for(i=0; i<h; i+=2){\
429
            uint64_t a= LD64(pixels  );\
430
            uint64_t b= LD64(pixels+1);\
431
            l1=  (a&0x0303030303030303ULL)\
432
               + (b&0x0303030303030303ULL);\
433
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
434
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
435
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
436
            pixels+=line_size;\
437
            block +=line_size;\
438
            a= LD64(pixels  );\
439
            b= LD64(pixels+1);\
440
            l0=  (a&0x0303030303030303ULL)\
441
               + (b&0x0303030303030303ULL)\
442
               + 0x0202020202020202ULL;\
443
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
444
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
445
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
446
            pixels+=line_size;\
447
            block +=line_size;\
448
        }\
449
}\
450
\
451
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
452
{\
453
        int i;\
454
        const uint64_t a= LD64(pixels  );\
455
        const uint64_t b= LD64(pixels+1);\
456
        uint64_t l0=  (a&0x0303030303030303ULL)\
457
                    + (b&0x0303030303030303ULL)\
458
                    + 0x0101010101010101ULL;\
459
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
460
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
461
        uint64_t l1,h1;\
462
\
463
        pixels+=line_size;\
464
        for(i=0; i<h; i+=2){\
465
            uint64_t a= LD64(pixels  );\
466
            uint64_t b= LD64(pixels+1);\
467
            l1=  (a&0x0303030303030303ULL)\
468
               + (b&0x0303030303030303ULL);\
469
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
470
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
471
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
472
            pixels+=line_size;\
473
            block +=line_size;\
474
            a= LD64(pixels  );\
475
            b= LD64(pixels+1);\
476
            l0=  (a&0x0303030303030303ULL)\
477
               + (b&0x0303030303030303ULL)\
478
               + 0x0101010101010101ULL;\
479
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
480
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
481
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
482
            pixels+=line_size;\
483
            block +=line_size;\
484
        }\
485
}\
486
\
487
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
488
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
489
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
490
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
491
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
492
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
493
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
494

495
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
496
#else // 64 bit variant
497

    
498
#define PIXOP2(OPNAME, OP) \
499
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
500
    int i;\
501
    for(i=0; i<h; i++){\
502
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
503
        pixels+=line_size;\
504
        block +=line_size;\
505
    }\
506
}\
507
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
508
    int i;\
509
    for(i=0; i<h; i++){\
510
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
511
        pixels+=line_size;\
512
        block +=line_size;\
513
    }\
514
}\
515
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
516
    int i;\
517
    for(i=0; i<h; i++){\
518
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
519
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
520
        pixels+=line_size;\
521
        block +=line_size;\
522
    }\
523
}\
524
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
525
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
526
}\
527
\
528
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
529
                                                int src_stride1, int src_stride2, int h){\
530
    int i;\
531
    for(i=0; i<h; i++){\
532
        uint32_t a,b;\
533
        a= LD32(&src1[i*src_stride1  ]);\
534
        b= LD32(&src2[i*src_stride2  ]);\
535
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
536
        a= LD32(&src1[i*src_stride1+4]);\
537
        b= LD32(&src2[i*src_stride2+4]);\
538
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
539
    }\
540
}\
541
\
542
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
543
                                                int src_stride1, int src_stride2, int h){\
544
    int i;\
545
    for(i=0; i<h; i++){\
546
        uint32_t a,b;\
547
        a= LD32(&src1[i*src_stride1  ]);\
548
        b= LD32(&src2[i*src_stride2  ]);\
549
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
550
        a= LD32(&src1[i*src_stride1+4]);\
551
        b= LD32(&src2[i*src_stride2+4]);\
552
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
553
    }\
554
}\
555
\
556
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
557
                                                int src_stride1, int src_stride2, int h){\
558
    int i;\
559
    for(i=0; i<h; i++){\
560
        uint32_t a,b;\
561
        a= LD32(&src1[i*src_stride1  ]);\
562
        b= LD32(&src2[i*src_stride2  ]);\
563
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
564
    }\
565
}\
566
\
567
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
568
                                                int src_stride1, int src_stride2, int h){\
569
    int i;\
570
    for(i=0; i<h; i++){\
571
        uint32_t a,b;\
572
        a= LD16(&src1[i*src_stride1  ]);\
573
        b= LD16(&src2[i*src_stride2  ]);\
574
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
575
    }\
576
}\
577
\
578
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579
                                                int src_stride1, int src_stride2, int h){\
580
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
581
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
582
}\
583
\
584
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
585
                                                int src_stride1, int src_stride2, int h){\
586
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
587
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
588
}\
589
\
590
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
591
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
592
}\
593
\
594
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
595
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
596
}\
597
\
598
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
599
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
600
}\
601
\
602
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
603
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
604
}\
605
\
606
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
607
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
608
    int i;\
609
    for(i=0; i<h; i++){\
610
        uint32_t a, b, c, d, l0, l1, h0, h1;\
611
        a= LD32(&src1[i*src_stride1]);\
612
        b= LD32(&src2[i*src_stride2]);\
613
        c= LD32(&src3[i*src_stride3]);\
614
        d= LD32(&src4[i*src_stride4]);\
615
        l0=  (a&0x03030303UL)\
616
           + (b&0x03030303UL)\
617
           + 0x02020202UL;\
618
        h0= ((a&0xFCFCFCFCUL)>>2)\
619
          + ((b&0xFCFCFCFCUL)>>2);\
620
        l1=  (c&0x03030303UL)\
621
           + (d&0x03030303UL);\
622
        h1= ((c&0xFCFCFCFCUL)>>2)\
623
          + ((d&0xFCFCFCFCUL)>>2);\
624
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
625
        a= LD32(&src1[i*src_stride1+4]);\
626
        b= LD32(&src2[i*src_stride2+4]);\
627
        c= LD32(&src3[i*src_stride3+4]);\
628
        d= LD32(&src4[i*src_stride4+4]);\
629
        l0=  (a&0x03030303UL)\
630
           + (b&0x03030303UL)\
631
           + 0x02020202UL;\
632
        h0= ((a&0xFCFCFCFCUL)>>2)\
633
          + ((b&0xFCFCFCFCUL)>>2);\
634
        l1=  (c&0x03030303UL)\
635
           + (d&0x03030303UL);\
636
        h1= ((c&0xFCFCFCFCUL)>>2)\
637
          + ((d&0xFCFCFCFCUL)>>2);\
638
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
639
    }\
640
}\
641
\
642
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
643
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
644
}\
645
\
646
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
647
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
648
}\
649
\
650
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
651
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
652
}\
653
\
654
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
655
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
656
}\
657
\
658
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
659
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
660
    int i;\
661
    for(i=0; i<h; i++){\
662
        uint32_t a, b, c, d, l0, l1, h0, h1;\
663
        a= LD32(&src1[i*src_stride1]);\
664
        b= LD32(&src2[i*src_stride2]);\
665
        c= LD32(&src3[i*src_stride3]);\
666
        d= LD32(&src4[i*src_stride4]);\
667
        l0=  (a&0x03030303UL)\
668
           + (b&0x03030303UL)\
669
           + 0x01010101UL;\
670
        h0= ((a&0xFCFCFCFCUL)>>2)\
671
          + ((b&0xFCFCFCFCUL)>>2);\
672
        l1=  (c&0x03030303UL)\
673
           + (d&0x03030303UL);\
674
        h1= ((c&0xFCFCFCFCUL)>>2)\
675
          + ((d&0xFCFCFCFCUL)>>2);\
676
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
677
        a= LD32(&src1[i*src_stride1+4]);\
678
        b= LD32(&src2[i*src_stride2+4]);\
679
        c= LD32(&src3[i*src_stride3+4]);\
680
        d= LD32(&src4[i*src_stride4+4]);\
681
        l0=  (a&0x03030303UL)\
682
           + (b&0x03030303UL)\
683
           + 0x01010101UL;\
684
        h0= ((a&0xFCFCFCFCUL)>>2)\
685
          + ((b&0xFCFCFCFCUL)>>2);\
686
        l1=  (c&0x03030303UL)\
687
           + (d&0x03030303UL);\
688
        h1= ((c&0xFCFCFCFCUL)>>2)\
689
          + ((d&0xFCFCFCFCUL)>>2);\
690
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
691
    }\
692
}\
693
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
694
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
695
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
696
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
}\
698
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
699
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
700
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
701
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
}\
703
\
704
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
705
{\
706
        int i, a0, b0, a1, b1;\
707
        a0= pixels[0];\
708
        b0= pixels[1] + 2;\
709
        a0 += b0;\
710
        b0 += pixels[2];\
711
\
712
        pixels+=line_size;\
713
        for(i=0; i<h; i+=2){\
714
            a1= pixels[0];\
715
            b1= pixels[1];\
716
            a1 += b1;\
717
            b1 += pixels[2];\
718
\
719
            block[0]= (a1+a0)>>2; /* FIXME non put */\
720
            block[1]= (b1+b0)>>2;\
721
\
722
            pixels+=line_size;\
723
            block +=line_size;\
724
\
725
            a0= pixels[0];\
726
            b0= pixels[1] + 2;\
727
            a0 += b0;\
728
            b0 += pixels[2];\
729
\
730
            block[0]= (a1+a0)>>2;\
731
            block[1]= (b1+b0)>>2;\
732
            pixels+=line_size;\
733
            block +=line_size;\
734
        }\
735
}\
736
\
737
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
738
{\
739
        int i;\
740
        const uint32_t a= LD32(pixels  );\
741
        const uint32_t b= LD32(pixels+1);\
742
        uint32_t l0=  (a&0x03030303UL)\
743
                    + (b&0x03030303UL)\
744
                    + 0x02020202UL;\
745
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
746
                   + ((b&0xFCFCFCFCUL)>>2);\
747
        uint32_t l1,h1;\
748
\
749
        pixels+=line_size;\
750
        for(i=0; i<h; i+=2){\
751
            uint32_t a= LD32(pixels  );\
752
            uint32_t b= LD32(pixels+1);\
753
            l1=  (a&0x03030303UL)\
754
               + (b&0x03030303UL);\
755
            h1= ((a&0xFCFCFCFCUL)>>2)\
756
              + ((b&0xFCFCFCFCUL)>>2);\
757
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
758
            pixels+=line_size;\
759
            block +=line_size;\
760
            a= LD32(pixels  );\
761
            b= LD32(pixels+1);\
762
            l0=  (a&0x03030303UL)\
763
               + (b&0x03030303UL)\
764
               + 0x02020202UL;\
765
            h0= ((a&0xFCFCFCFCUL)>>2)\
766
              + ((b&0xFCFCFCFCUL)>>2);\
767
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
768
            pixels+=line_size;\
769
            block +=line_size;\
770
        }\
771
}\
772
\
773
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
774
{\
775
    int j;\
776
    for(j=0; j<2; j++){\
777
        int i;\
778
        const uint32_t a= LD32(pixels  );\
779
        const uint32_t b= LD32(pixels+1);\
780
        uint32_t l0=  (a&0x03030303UL)\
781
                    + (b&0x03030303UL)\
782
                    + 0x02020202UL;\
783
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
784
                   + ((b&0xFCFCFCFCUL)>>2);\
785
        uint32_t l1,h1;\
786
\
787
        pixels+=line_size;\
788
        for(i=0; i<h; i+=2){\
789
            uint32_t a= LD32(pixels  );\
790
            uint32_t b= LD32(pixels+1);\
791
            l1=  (a&0x03030303UL)\
792
               + (b&0x03030303UL);\
793
            h1= ((a&0xFCFCFCFCUL)>>2)\
794
              + ((b&0xFCFCFCFCUL)>>2);\
795
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796
            pixels+=line_size;\
797
            block +=line_size;\
798
            a= LD32(pixels  );\
799
            b= LD32(pixels+1);\
800
            l0=  (a&0x03030303UL)\
801
               + (b&0x03030303UL)\
802
               + 0x02020202UL;\
803
            h0= ((a&0xFCFCFCFCUL)>>2)\
804
              + ((b&0xFCFCFCFCUL)>>2);\
805
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
806
            pixels+=line_size;\
807
            block +=line_size;\
808
        }\
809
        pixels+=4-line_size*(h+1);\
810
        block +=4-line_size*h;\
811
    }\
812
}\
813
\
814
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
815
{\
816
    int j;\
817
    for(j=0; j<2; j++){\
818
        int i;\
819
        const uint32_t a= LD32(pixels  );\
820
        const uint32_t b= LD32(pixels+1);\
821
        uint32_t l0=  (a&0x03030303UL)\
822
                    + (b&0x03030303UL)\
823
                    + 0x01010101UL;\
824
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
825
                   + ((b&0xFCFCFCFCUL)>>2);\
826
        uint32_t l1,h1;\
827
\
828
        pixels+=line_size;\
829
        for(i=0; i<h; i+=2){\
830
            uint32_t a= LD32(pixels  );\
831
            uint32_t b= LD32(pixels+1);\
832
            l1=  (a&0x03030303UL)\
833
               + (b&0x03030303UL);\
834
            h1= ((a&0xFCFCFCFCUL)>>2)\
835
              + ((b&0xFCFCFCFCUL)>>2);\
836
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
837
            pixels+=line_size;\
838
            block +=line_size;\
839
            a= LD32(pixels  );\
840
            b= LD32(pixels+1);\
841
            l0=  (a&0x03030303UL)\
842
               + (b&0x03030303UL)\
843
               + 0x01010101UL;\
844
            h0= ((a&0xFCFCFCFCUL)>>2)\
845
              + ((b&0xFCFCFCFCUL)>>2);\
846
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847
            pixels+=line_size;\
848
            block +=line_size;\
849
        }\
850
        pixels+=4-line_size*(h+1);\
851
        block +=4-line_size*h;\
852
    }\
853
}\
854
\
855
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
856
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
859
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
863

    
864
/* Rounding 32-bit packed average used by the PIXOP2 template above.
 * NOTE(review): the #endif below closes a preprocessor conditional that
 * begins before this point in the file (word-size variant selection) —
 * confirm against the full file before reorganizing. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the averaging and copying pixel primitives from the template. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounding averages of 2 and 4 values, used by later helpers. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
875

    
876

    
877
/**
 * 1-warp-point global motion compensation for an 8-pixel-wide block.
 * Bilinear interpolation with 1/16-pel precision: the four corner weights
 * A..D sum to 256, so (sum + rounder) >> 8 stays in pixel range.
 *
 * @param x16,y16  fractional position, 0..16 units of 1/16 pel
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
899

    
900
/**
 * Affine global motion compensation for an 8-pixel-wide block.
 * (ox,oy) is the motion vector origin and (dxx,dxy,dyx,dyy) the affine
 * increments, all in 16.16 fixed point; 'shift' gives the sub-pel
 * precision (s = 1<<shift steps per pixel) and 'r' the rounding constant.
 * Samples falling outside [0,width)x[0,height) are clamped to the edge,
 * with interpolation reduced to 1-D (or none) at the border.
 * NOTE(review): clip() is a helper defined elsewhere in this file.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* turn width/height into the largest valid coordinate */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the 16.16 vector into integer pel and sub-pel fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            /* unsigned compare handles negative coordinates as out-of-range too */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically clamped: horizontal-only interpolation, scaled
                       by s so the final >>(shift*2) still normalizes correctly */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally clamped: vertical-only interpolation */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: nearest edge pixel, no interpolation */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            /* advance the vector along the row */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance the row origin */
        ox += dxy;
        oy += dyy;
    }
}
957

    
958
/* Full-pel copy: dispatch on block width to the plain pixel-copy primitives. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
966

    
967
/* Thirdpel 1/3 horizontal: weights (2,1) on (x, x+1); 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
977

    
978
/* Thirdpel 2/3 horizontal: weights (1,2) on (x, x+1). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
988
    
989
/* Thirdpel 1/3 vertical: weights (2,1) on (y, y+1). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
999
    
1000
/* Thirdpel (1/3,1/3) diagonal: corner weights (4,3,3,2), 2731/32768 ≈ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1010

    
1011
/* Thirdpel (1/3,2/3): corner weights (3,2,4,3). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1021

    
1022
/* Thirdpel 2/3 vertical: weights (1,2) on (y, y+1). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1032

    
1033
/* Thirdpel (2/3,1/3): corner weights (3,4,2,3). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1043

    
1044
/* Thirdpel (2/3,2/3): corner weights (2,3,3,4). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1054

    
1055
/* Full-pel average: dispatch on block width to the pixel-averaging primitives. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1063

    
1064
/* Thirdpel 1/3 horizontal, then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1074

    
1075
/* Thirdpel 2/3 horizontal, then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1085
    
1086
/* Thirdpel 1/3 vertical, then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1096
    
1097
/* Thirdpel (1/3,1/3) diagonal, then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1107

    
1108
/* Thirdpel (1/3,2/3), then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1118

    
1119
/* Thirdpel 2/3 vertical, then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1129

    
1130
/* Thirdpel (2/3,1/3), then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1140

    
1141
/* Thirdpel (2/3,2/3), then rounding-average with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1151
#if 0
1152
#define TPEL_WIDTH(width)\
1153
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1154
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1155
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1156
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1157
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1158
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1159
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1160
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1161
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1162
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1163
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1164
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1165
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1166
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1167
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1168
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1169
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1170
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1171
#endif
1172

    
1173
/**
 * H.264 chroma motion compensation template for 2-, 4- and 8-pixel-wide
 * blocks.  Bilinear interpolation with 1/8-pel precision: the four corner
 * weights A..D sum to 64; normalization back to pixel range is done inside
 * the OP macro supplied by the instantiation (see the op_put/op_avg
 * definitions following this template).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1235

    
1236
/* H.264 chroma MC rounding ops: the bilinear sum carries a factor of 64
 * (A+B+C+D == 64 in the template above), so (b + 32) >> 6 normalizes it
 * back to pixel range; op_avg additionally rounding-averages with dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1243

    
1244
/* Copy a 4-pixel-wide block of h rows using the unaligned 32-bit
 * load/store helpers LD32/ST32. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1254

    
1255
/* Copy an 8-pixel-wide block of h rows as two 32-bit chunks per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1266

    
1267
/* Copy a 16-pixel-wide block of h rows as four 32-bit chunks per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1280

    
1281
/* Copy a 17-pixel-wide block of h rows: four 32-bit chunks plus the
 * trailing odd byte (the extra column needed by the qpel filters). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1295

    
1296
/* Copy a 9-pixel-wide block of h rows: two 32-bit chunks plus the
 * trailing odd byte (the extra column needed by the qpel filters). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1308

    
1309

    
1310
#define QPEL_MC(r, OPNAME, RND, OP) \
1311
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313
    int i;\
1314
    for(i=0; i<h; i++)\
1315
    {\
1316
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324
        dst+=dstStride;\
1325
        src+=srcStride;\
1326
    }\
1327
}\
1328
\
1329
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330
    const int w=8;\
1331
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332
    int i;\
1333
    for(i=0; i<w; i++)\
1334
    {\
1335
        const int src0= src[0*srcStride];\
1336
        const int src1= src[1*srcStride];\
1337
        const int src2= src[2*srcStride];\
1338
        const int src3= src[3*srcStride];\
1339
        const int src4= src[4*srcStride];\
1340
        const int src5= src[5*srcStride];\
1341
        const int src6= src[6*srcStride];\
1342
        const int src7= src[7*srcStride];\
1343
        const int src8= src[8*srcStride];\
1344
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352
        dst++;\
1353
        src++;\
1354
    }\
1355
}\
1356
\
1357
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359
    int i;\
1360
    \
1361
    for(i=0; i<h; i++)\
1362
    {\
1363
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379
        dst+=dstStride;\
1380
        src+=srcStride;\
1381
    }\
1382
}\
1383
\
1384
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386
    int i;\
1387
    const int w=16;\
1388
    for(i=0; i<w; i++)\
1389
    {\
1390
        const int src0= src[0*srcStride];\
1391
        const int src1= src[1*srcStride];\
1392
        const int src2= src[2*srcStride];\
1393
        const int src3= src[3*srcStride];\
1394
        const int src4= src[4*srcStride];\
1395
        const int src5= src[5*srcStride];\
1396
        const int src6= src[6*srcStride];\
1397
        const int src7= src[7*srcStride];\
1398
        const int src8= src[8*srcStride];\
1399
        const int src9= src[9*srcStride];\
1400
        const int src10= src[10*srcStride];\
1401
        const int src11= src[11*srcStride];\
1402
        const int src12= src[12*srcStride];\
1403
        const int src13= src[13*srcStride];\
1404
        const int src14= src[14*srcStride];\
1405
        const int src15= src[15*srcStride];\
1406
        const int src16= src[16*srcStride];\
1407
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423
        dst++;\
1424
        src++;\
1425
    }\
1426
}\
1427
\
1428
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1430
}\
1431
\
1432
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433
    uint8_t half[64];\
1434
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436
}\
1437
\
1438
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440
}\
1441
\
1442
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443
    uint8_t half[64];\
1444
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446
}\
1447
\
1448
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449
    uint8_t full[16*9];\
1450
    uint8_t half[64];\
1451
    copy_block9(full, src, 16, stride, 9);\
1452
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454
}\
1455
\
1456
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458
    copy_block9(full, src, 16, stride, 9);\
1459
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460
}\
1461
\
1462
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463
    uint8_t full[16*9];\
1464
    uint8_t half[64];\
1465
    copy_block9(full, src, 16, stride, 9);\
1466
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468
}\
1469
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470
    uint8_t full[16*9];\
1471
    uint8_t halfH[72];\
1472
    uint8_t halfV[64];\
1473
    uint8_t halfHV[64];\
1474
    copy_block9(full, src, 16, stride, 9);\
1475
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479
}\
1480
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481
    uint8_t full[16*9];\
1482
    uint8_t halfH[72];\
1483
    uint8_t halfHV[64];\
1484
    copy_block9(full, src, 16, stride, 9);\
1485
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489
}\
1490
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491
    uint8_t full[16*9];\
1492
    uint8_t halfH[72];\
1493
    uint8_t halfV[64];\
1494
    uint8_t halfHV[64];\
1495
    copy_block9(full, src, 16, stride, 9);\
1496
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500
}\
1501
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502
    uint8_t full[16*9];\
1503
    uint8_t halfH[72];\
1504
    uint8_t halfHV[64];\
1505
    copy_block9(full, src, 16, stride, 9);\
1506
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510
}\
1511
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512
    uint8_t full[16*9];\
1513
    uint8_t halfH[72];\
1514
    uint8_t halfV[64];\
1515
    uint8_t halfHV[64];\
1516
    copy_block9(full, src, 16, stride, 9);\
1517
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521
}\
1522
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523
    uint8_t full[16*9];\
1524
    uint8_t halfH[72];\
1525
    uint8_t halfHV[64];\
1526
    copy_block9(full, src, 16, stride, 9);\
1527
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531
}\
1532
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533
    uint8_t full[16*9];\
1534
    uint8_t halfH[72];\
1535
    uint8_t halfV[64];\
1536
    uint8_t halfHV[64];\
1537
    copy_block9(full, src, 16, stride, 9);\
1538
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542
}\
1543
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544
    uint8_t full[16*9];\
1545
    uint8_t halfH[72];\
1546
    uint8_t halfHV[64];\
1547
    copy_block9(full, src, 16, stride, 9);\
1548
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552
}\
1553
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554
    uint8_t halfH[72];\
1555
    uint8_t halfHV[64];\
1556
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559
}\
1560
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561
    uint8_t halfH[72];\
1562
    uint8_t halfHV[64];\
1563
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566
}\
1567
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568
    uint8_t full[16*9];\
1569
    uint8_t halfH[72];\
1570
    uint8_t halfV[64];\
1571
    uint8_t halfHV[64];\
1572
    copy_block9(full, src, 16, stride, 9);\
1573
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577
}\
1578
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579
    uint8_t full[16*9];\
1580
    uint8_t halfH[72];\
1581
    copy_block9(full, src, 16, stride, 9);\
1582
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585
}\
1586
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587
    uint8_t full[16*9];\
1588
    uint8_t halfH[72];\
1589
    uint8_t halfV[64];\
1590
    uint8_t halfHV[64];\
1591
    copy_block9(full, src, 16, stride, 9);\
1592
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596
}\
1597
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598
    uint8_t full[16*9];\
1599
    uint8_t halfH[72];\
1600
    copy_block9(full, src, 16, stride, 9);\
1601
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604
}\
1605
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606
    uint8_t halfH[72];\
1607
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609
}\
1610
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1612
}\
1613
\
1614
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615
    uint8_t half[256];\
1616
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618
}\
1619
\
1620
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622
}\
1623
\
1624
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625
    uint8_t half[256];\
1626
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628
}\
1629
\
1630
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631
    uint8_t full[24*17];\
1632
    uint8_t half[256];\
1633
    copy_block17(full, src, 24, stride, 17);\
1634
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636
}\
1637
\
1638
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640
    copy_block17(full, src, 24, stride, 17);\
1641
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642
}\
1643
\
1644
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645
    uint8_t full[24*17];\
1646
    uint8_t half[256];\
1647
    copy_block17(full, src, 24, stride, 17);\
1648
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650
}\
1651
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652
    uint8_t full[24*17];\
1653
    uint8_t halfH[272];\
1654
    uint8_t halfV[256];\
1655
    uint8_t halfHV[256];\
1656
    copy_block17(full, src, 24, stride, 17);\
1657
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661
}\
1662
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663
    uint8_t full[24*17];\
1664
    uint8_t halfH[272];\
1665
    uint8_t halfHV[256];\
1666
    copy_block17(full, src, 24, stride, 17);\
1667
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671
}\
1672
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673
    uint8_t full[24*17];\
1674
    uint8_t halfH[272];\
1675
    uint8_t halfV[256];\
1676
    uint8_t halfHV[256];\
1677
    copy_block17(full, src, 24, stride, 17);\
1678
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682
}\
1683
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684
    uint8_t full[24*17];\
1685
    uint8_t halfH[272];\
1686
    uint8_t halfHV[256];\
1687
    copy_block17(full, src, 24, stride, 17);\
1688
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692
}\
1693
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694
    uint8_t full[24*17];\
1695
    uint8_t halfH[272];\
1696
    uint8_t halfV[256];\
1697
    uint8_t halfHV[256];\
1698
    copy_block17(full, src, 24, stride, 17);\
1699
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703
}\
1704
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705
    uint8_t full[24*17];\
1706
    uint8_t halfH[272];\
1707
    uint8_t halfHV[256];\
1708
    copy_block17(full, src, 24, stride, 17);\
1709
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713
}\
1714
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715
    uint8_t full[24*17];\
1716
    uint8_t halfH[272];\
1717
    uint8_t halfV[256];\
1718
    uint8_t halfHV[256];\
1719
    copy_block17(full, src, 24, stride, 17);\
1720
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724
}\
1725
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726
    uint8_t full[24*17];\
1727
    uint8_t halfH[272];\
1728
    uint8_t halfHV[256];\
1729
    copy_block17(full, src, 24, stride, 17);\
1730
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734
}\
1735
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736
    uint8_t halfH[272];\
1737
    uint8_t halfHV[256];\
1738
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741
}\
1742
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743
    uint8_t halfH[272];\
1744
    uint8_t halfHV[256];\
1745
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748
}\
1749
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750
    uint8_t full[24*17];\
1751
    uint8_t halfH[272];\
1752
    uint8_t halfV[256];\
1753
    uint8_t halfHV[256];\
1754
    copy_block17(full, src, 24, stride, 17);\
1755
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759
}\
1760
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761
    uint8_t full[24*17];\
1762
    uint8_t halfH[272];\
1763
    copy_block17(full, src, 24, stride, 17);\
1764
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767
}\
1768
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[24*17];\
1770
    uint8_t halfH[272];\
1771
    uint8_t halfV[256];\
1772
    uint8_t halfHV[256];\
1773
    copy_block17(full, src, 24, stride, 17);\
1774
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778
}\
1779
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780
    uint8_t full[24*17];\
1781
    uint8_t halfH[272];\
1782
    copy_block17(full, src, 24, stride, 17);\
1783
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786
}\
1787
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788
    uint8_t halfH[272];\
1789
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791
}
/* Pixel store ops plugged into QPEL_MC() below.  'b' is the raw 6-tap
 * filter sum (weights total 32), so "+16 >> 5" rounds to nearest and
 * "+15 >> 5" is the no-rounding variant; 'cm' is the clipping table
 * (cropTbl + MAX_NEG_CROP) in scope at each expansion site.
 * op_avg additionally averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

1798
QPEL_MC(0, put_       , _       , op_put)
1799
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800
QPEL_MC(0, avg_       , _       , op_avg)
1801
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1802
#undef op_avg
1803
#undef op_avg_no_rnd
1804
#undef op_put
1805
#undef op_put_no_rnd
1806

    
1807
#if 1
/* H.264 luma half-pel interpolation kernels using the 6-tap filter
 * (1,-5,20,20,-5,1).  _h_ filters horizontally, _v_ vertically; _hv_
 * filters horizontally into the int16_t 'tmp' buffer (unclipped
 * intermediates) and then vertically with OP2, which rounds by
 * "+512 >> 10" since two filter passes give a total weight of 1024.
 * 'cm' clips results to [0,255].  The 16x16 versions are built from
 * four 8x8 calls. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H.264 quarter-pel motion compensation for one block size (SIZE = 4, 8
 * or 16).  mcXY names encode the fractional position: X is the
 * horizontal and Y the vertical quarter-pel offset.  Full-pel copies
 * use pixels*_c; half-pel positions use the lowpass kernels; quarter-pel
 * positions average two half-pel results with pixels*_l2.  'full' holds
 * a copy of the source with 2 extra rows above/below for the 6-tap
 * vertical filter; 'tmp' is the int16_t intermediate for hv filtering. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2149
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2150
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2151
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2152
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2153

    
2154
H264_LOWPASS(put_       , op_put, op2_put)
2155
H264_LOWPASS(avg_       , op_avg, op2_avg)
2156
H264_MC(put_, 4)
2157
H264_MC(put_, 8)
2158
H264_MC(put_, 16)
2159
H264_MC(avg_, 4)
2160
H264_MC(avg_, 8)
2161
H264_MC(avg_, 16)
2162

    
2163
#undef op_avg
2164
#undef op_put
2165
#undef op2_avg
2166
#undef op2_put
2167
#endif
2168

    
2169
/* WMV2 horizontal half-pel low-pass filter over h rows of 8 pixels:
   dst[x] = clip((9*(s[x]+s[x+1]) - (s[x-1]+s[x+2]) + 8) >> 4) */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int row, x;

    for(row=0; row<h; row++){
        for(x=0; x<8; x++)
            dst[x]= cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8)>>4];
        dst += dstStride;
        src += srcStride;
    }
}
2186

    
2187
/* WMV2 vertical half-pel low-pass filter over w columns of 8 output rows;
   each output row y uses the four taps at rows y-1 .. y+2. */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int col, y;

    for(col=0; col<w; col++){
        for(y=0; y<8; y++){
            const int above = src[(y-1)*srcStride];
            const int cur   = src[ y   *srcStride];
            const int below = src[(y+1)*srcStride];
            const int far_  = src[(y+2)*srcStride];
            dst[y*dstStride]= cm[(9*(cur + below) - (above + far_) + 8)>>4];
        }
        src++;
        dst++;
    }
}
2215

    
2216
/* mspel (0,0): integer-pel position, plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2219

    
2220
/* mspel (1/4,0): average of the source and the horizontally filtered half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];
    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hbuf, stride, stride, 8, 8);
}
2225

    
2226
/* mspel (1/2,0): horizontal half-pel, filter written straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2229

    
2230
/* mspel (3/4,0): average of src shifted right by one pixel and the half-pel filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];
    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hbuf, stride, stride, 8, 8);
}
2235

    
2236
/* mspel (0,1/2): vertical half-pel, filter written straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2239

    
2240
/* mspel (1/4,1/2): average of the vertical half-pel and the H-then-V filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hband[88];   /* 8x11 horizontally filtered band (1 row above, 2 below) */
    uint8_t vhalf[64];
    uint8_t hvhalf[64];
    wmv2_mspel8_h_lowpass(hband, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vhalf, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvhalf, hband+8, 8, 8, 8);
    put_pixels8_l2(dst, vhalf, hvhalf, stride, 8, 8, 8);
}
2249
/* mspel (3/4,1/2): like mc12 but the vertical half-pel is taken one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hband[88];   /* 8x11 horizontally filtered band */
    uint8_t vhalf[64];
    uint8_t hvhalf[64];
    wmv2_mspel8_h_lowpass(hband, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vhalf, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvhalf, hband+8, 8, 8, 8);
    put_pixels8_l2(dst, vhalf, hvhalf, stride, 8, 8, 8);
}
2258
/* mspel (1/2,1/2): horizontal filter on an 11-row band, then vertical filter into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hband[88];
    wmv2_mspel8_h_lowpass(hband, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hband+8, stride, 8, 8);
}
2263

    
2264

    
2265
/* Sum of absolute differences of a 16x16 block. */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2292

    
2293
/* 16x16 SAD against the horizontal half-pel interpolation of pix2
   (reads one extra column, pix2[16]). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2320

    
2321
/* 16x16 SAD against the vertical half-pel interpolation of pix2
   (reads one extra row). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *next = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], next[x]));
        pix1 += line_size;
        pix2 += line_size;
        next += line_size;
    }
    return sum;
}
2350

    
2351
/* 16x16 SAD against the diagonal half-pel interpolation of pix2
   (4-tap average; reads one extra row and column). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *next = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], next[x], next[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        next += line_size;
    }
    return sum;
}
2380

    
2381
/* Sum of absolute differences of an 8x8 block. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2400

    
2401
/* 8x8 SAD against the horizontal half-pel interpolation of pix2
   (reads one extra column, pix2[8]). */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2420

    
2421
/* 8x8 SAD against the vertical half-pel interpolation of pix2
   (reads one extra row). */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *next = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], next[x]));
        pix1 += line_size;
        pix2 += line_size;
        next += line_size;
    }
    return sum;
}
2442

    
2443
/* 8x8 SAD against the diagonal half-pel interpolation of pix2
   (4-tap average; reads one extra row and column). */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *next = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], next[x], next[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        next += line_size;
    }
    return sum;
}
2464

    
2465
/* DSPContext-compatible 16x16 SAD wrapper (context argument unused). */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a, b, stride);
}
2468

    
2469
/* DSPContext-compatible 8x8 SAD wrapper (context argument unused). */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a, b, stride);
}
2472

    
2473
/**
2474
 * permutes an 8x8 block.
2475
 * @param block the block which will be permuted according to the given permutation vector
2476
 * @param permutation the permutation vector
2477
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2478
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2479
 *                  (inverse) permutated to scantable order!
2480
 */
2481
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2482
{
2483
    int i;
2484
    DCTELEM temp[64];
2485
    
2486
    if(last<=0) return;
2487
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2488

    
2489
    for(i=0; i<=last; i++){
2490
        const int j= scantable[i];
2491
        temp[j]= block[j];
2492
        block[j]=0;
2493
    }
2494
    
2495
    for(i=0; i<=last; i++){
2496
        const int j= scantable[i];
2497
        const int perm_j= permutation[j];
2498
        block[perm_j]= temp[j];
2499
    }
2500
}
2501

    
2502
/**
2503
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2504
 */
2505
static void clear_blocks_c(DCTELEM *blocks)
2506
{
2507
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
2508
}
2509

    
2510
/* dst[i] += src[i] for i in [0, w); byte arithmetic wraps mod 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
2525

    
2526
/* dst[i] = src1[i] - src2[i] for i in [0, w); byte arithmetic wraps mod 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2541

    
2542
/* HuffYUV median-prediction residual: dst[i] = src2[i] - median(left, above, left+above-left_top).
   *left / *left_top carry the running predictor state across calls. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev_left, prev_lt;
    int i;

    prev_left = *left;
    prev_lt   = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(prev_left, src1[i], (prev_left + src1[i] - prev_lt) & 0xFF);
        prev_lt   = src1[i];
        prev_left = src2[i];
        dst[i] = prev_left - pred;
    }

    *left     = prev_left;
    *left_top = prev_lt;
}
2559

    
2560
/* 2-point butterfly into separate outputs: o1 = i1+i2, o2 = i1-i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place 2-point butterfly: x,y <- x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int t0, t1;\
    t0 = x;\
    t1 = y;\
    x = t0 + t1;\
    y = t0 - t1;\
}

/* |x+y| + |x-y| — final butterfly stage folded into the absolute sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2574

    
2575
/* Sum of absolute values of the 8x8 Hadamard transform of src-dst (SATD). */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int t[64];
    int sum = 0;
    int i;

    /* horizontal pass: 8-point Hadamard transform of each difference row */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(t[8*i+0], t[8*i+1], src[stride*i+0]-dst[stride*i+0], src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(t[8*i+2], t[8*i+3], src[stride*i+2]-dst[stride*i+2], src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(t[8*i+4], t[8*i+5], src[stride*i+4]-dst[stride*i+4], src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(t[8*i+6], t[8*i+7], src[stride*i+6]-dst[stride*i+6], src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(t[8*i+0], t[8*i+2]);
        BUTTERFLY1(t[8*i+1], t[8*i+3]);
        BUTTERFLY1(t[8*i+4], t[8*i+6]);
        BUTTERFLY1(t[8*i+5], t[8*i+7]);

        BUTTERFLY1(t[8*i+0], t[8*i+4]);
        BUTTERFLY1(t[8*i+1], t[8*i+5]);
        BUTTERFLY1(t[8*i+2], t[8*i+6]);
        BUTTERFLY1(t[8*i+3], t[8*i+7]);
    }

    /* vertical pass; the last butterfly stage is folded into the
       absolute-value accumulation via BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(t[8*0+i], t[8*1+i]);
        BUTTERFLY1(t[8*2+i], t[8*3+i]);
        BUTTERFLY1(t[8*4+i], t[8*5+i]);
        BUTTERFLY1(t[8*6+i], t[8*7+i]);

        BUTTERFLY1(t[8*0+i], t[8*2+i]);
        BUTTERFLY1(t[8*1+i], t[8*3+i]);
        BUTTERFLY1(t[8*4+i], t[8*6+i]);
        BUTTERFLY1(t[8*5+i], t[8*7+i]);

        sum += BUTTERFLYA(t[8*0+i], t[8*4+i])
             + BUTTERFLYA(t[8*1+i], t[8*5+i])
             + BUTTERFLYA(t[8*2+i], t[8*6+i])
             + BUTTERFLYA(t[8*3+i], t[8*7+i]);
    }
    return sum;
}
2624

    
2625
/* SATD of an 8x8 block after subtracting a constant mean from every pixel. */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int t[64];
    int sum = 0;
    int i;

//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: 8-point Hadamard transform of each mean-subtracted row */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(t[8*i+0], t[8*i+1], src[stride*i+0]-mean, src[stride*i+1]-mean);
        BUTTERFLY2(t[8*i+2], t[8*i+3], src[stride*i+2]-mean, src[stride*i+3]-mean);
        BUTTERFLY2(t[8*i+4], t[8*i+5], src[stride*i+4]-mean, src[stride*i+5]-mean);
        BUTTERFLY2(t[8*i+6], t[8*i+7], src[stride*i+6]-mean, src[stride*i+7]-mean);

        BUTTERFLY1(t[8*i+0], t[8*i+2]);
        BUTTERFLY1(t[8*i+1], t[8*i+3]);
        BUTTERFLY1(t[8*i+4], t[8*i+6]);
        BUTTERFLY1(t[8*i+5], t[8*i+7]);

        BUTTERFLY1(t[8*i+0], t[8*i+4]);
        BUTTERFLY1(t[8*i+1], t[8*i+5]);
        BUTTERFLY1(t[8*i+2], t[8*i+6]);
        BUTTERFLY1(t[8*i+3], t[8*i+7]);
    }

    /* vertical pass; final stage folded into the absolute-value sum */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(t[8*0+i], t[8*1+i]);
        BUTTERFLY1(t[8*2+i], t[8*3+i]);
        BUTTERFLY1(t[8*4+i], t[8*5+i]);
        BUTTERFLY1(t[8*6+i], t[8*7+i]);

        BUTTERFLY1(t[8*0+i], t[8*2+i]);
        BUTTERFLY1(t[8*1+i], t[8*3+i]);
        BUTTERFLY1(t[8*4+i], t[8*6+i]);
        BUTTERFLY1(t[8*5+i], t[8*7+i]);

        sum += BUTTERFLYA(t[8*0+i], t[8*4+i])
             + BUTTERFLYA(t[8*1+i], t[8*5+i])
             + BUTTERFLYA(t[8*2+i], t[8*6+i])
             + BUTTERFLYA(t[8*3+i], t[8*7+i]);
    }

    return sum;
}
2668

    
2669
/* Comparison metric: forward-DCT the 8x8 pixel difference and sum the
   absolute values of the coefficients. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s = (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp = (DCTELEM*)aligned_temp;
    int i, sum = 0;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for (i = 0; i < 64; i++)
        sum += ABS(temp[i]);

    return sum;
}
2683

    
2684
void simple_idct(DCTELEM *block); //FIXME
2685

    
2686
/* Comparison metric: squared error introduced by a quantize/dequantize
   round-trip of the DCT coefficients of the 8x8 difference block. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s = (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp = (DCTELEM*)aligned_temp;
    DCTELEM * const bak  = ((DCTELEM*)aligned_temp) + 64;
    int i, sum = 0;

    s->mb_intra = 0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep a pristine copy of the coefficients for the comparison below */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/] = s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for (i = 0; i < 64; i++)
        sum += (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2708

    
2709
/* Rate-distortion metric for an 8x8 block: reconstructs the block through
   quantize/dequantize/IDCT, measures SSE against src1, and adds the
   estimated VLC bit cost scaled by a lambda derived from qscale. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s = (MpegEncContext *)c;
    const uint8_t *scantable = s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp = (DCTELEM*)aligned_temp;
    uint8_t * const bak  = (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length;
    uint8_t *last_length;

    /* save the reference 8x8 block; the IDCT will reconstruct into it */
    for (i = 0; i < 8; i++) {
        ((uint32_t*)(bak + i*stride))[0] = ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1] = ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/] = last = s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits += s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        /* bit cost of every coefficient before the last, with run-length tracking */
        run = 0;
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;                 /* bias into the VLC table's index range */
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;      /* out of table range: escape code */
                run = 0;
            } else
                run++;
        }

        /* the last coefficient uses the dedicated "last" VLC table */
        i = scantable[last];

        level = temp[i] + 64;

        assert(level - 64);

        if ((level & (~127)) == 0)
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        else
            bits += esc_length;
    }

    if (last >= 0) {
        s->dct_unquantize(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion = s->dsp.sse[1](NULL, bak, src1, stride);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64) >> 7);
}
2782

    
2783
/* Rate metric for an 8x8 block: estimated VLC bit cost of the quantized
   DCT coefficients of the difference (no distortion term). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s = (MpegEncContext *)c;
    const uint8_t *scantable = s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp = (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length;
    uint8_t *last_length;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/] = last = s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits += s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        /* bit cost of every coefficient before the last, with run-length tracking */
        run = 0;
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;                 /* bias into the VLC table's index range */
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;      /* out of table range: escape code */
                run = 0;
            } else
                run++;
        }

        /* the last coefficient uses the dedicated "last" VLC table */
        i = scantable[last];

        level = temp[i] + 64;

        assert(level - 64);

        if ((level & (~127)) == 0)
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        else
            bits += esc_length;
    }

    return bits;
}
2840

    
2841

    
2842
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2843
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2844
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
2845
WARPER88_1616(rd8x8_c, rd16x16_c)
2846
WARPER88_1616(bit8x8_c, bit16x16_c)
2847

    
2848
/* XXX: those functions should be suppressed ASAP when all IDCTs are
2849
 converted */
2850
/* IJG reference IDCT, then clamp the result into the destination. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
2855
/* IJG reference IDCT, then add the clamped result onto the destination. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2860

    
2861
/* init static data */
2862
void dsputil_static_init(void)
2863
{
2864
    int i;
2865

    
2866
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2867
    for(i=0;i<MAX_NEG_CROP;i++) {
2868
        cropTbl[i] = 0;
2869
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
2870
    }
2871
    
2872
    for(i=0;i<512;i++) {
2873
        squareTbl[i] = (i - 256) * (i - 256);
2874
    }
2875
    
2876
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2877
}
2878

    
2879

    
2880
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2881
{
2882
    int i;
2883

    
2884
#ifdef CONFIG_ENCODERS
2885
    if(avctx->dct_algo==FF_DCT_FASTINT) {
2886
        c->fdct = fdct_ifast;
2887
        c->fdct248 = fdct_ifast248;
2888
    } 
2889
    else if(avctx->dct_algo==FF_DCT_FAAN) {
2890
        c->fdct = ff_faandct;
2891
        c->fdct248 = ff_faandct248; 
2892
    } 
2893
    else {
2894
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2895
        c->fdct248 = ff_fdct248_islow;
2896
    }
2897
#endif //CONFIG_ENCODERS
2898

    
2899
    if(avctx->idct_algo==FF_IDCT_INT){
2900
        c->idct_put= ff_jref_idct_put;
2901
        c->idct_add= ff_jref_idct_add;
2902
        c->idct    = j_rev_dct;
2903
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2904
    }else{ //accurate/default
2905
        c->idct_put= simple_idct_put;
2906
        c->idct_add= simple_idct_add;
2907
        c->idct    = simple_idct;
2908
        c->idct_permutation_type= FF_NO_IDCT_PERM;
2909
    }
2910

    
2911
    c->get_pixels = get_pixels_c;
2912
    c->diff_pixels = diff_pixels_c;
2913
    c->put_pixels_clamped = put_pixels_clamped_c;
2914
    c->add_pixels_clamped = add_pixels_clamped_c;
2915
    c->gmc1 = gmc1_c;
2916
    c->gmc = gmc_c;
2917
    c->clear_blocks = clear_blocks_c;
2918
    c->pix_sum = pix_sum_c;
2919
    c->pix_norm1 = pix_norm1_c;
2920
    c->sse[0]= sse16_c;
2921
    c->sse[1]= sse8_c;
2922

    
2923
    /* TODO [0] 16  [1] 8 */
2924
    c->pix_abs16x16     = pix_abs16x16_c;
2925
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
2926
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
2927
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2928
    c->pix_abs8x8     = pix_abs8x8_c;
2929
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
2930
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
2931
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2932

    
2933
#define dspfunc(PFX, IDX, NUM) \
2934
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
2935
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
2936
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
2937
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2938

    
2939
    dspfunc(put, 0, 16);
2940
    dspfunc(put_no_rnd, 0, 16);
2941
    dspfunc(put, 1, 8);
2942
    dspfunc(put_no_rnd, 1, 8);
2943
    dspfunc(put, 2, 4);
2944
    dspfunc(put, 3, 2);
2945

    
2946
    dspfunc(avg, 0, 16);
2947
    dspfunc(avg_no_rnd, 0, 16);
2948
    dspfunc(avg, 1, 8);
2949
    dspfunc(avg_no_rnd, 1, 8);
2950
    dspfunc(avg, 2, 4);
2951
    dspfunc(avg, 3, 2);
2952
#undef dspfunc
2953

    
2954
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2963

    
2964
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2973

    
2974
#define dspfunc(PFX, IDX, NUM) \
2975
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2991

    
2992
    dspfunc(put_qpel, 0, 16);
2993
    dspfunc(put_no_rnd_qpel, 0, 16);
2994

    
2995
    dspfunc(avg_qpel, 0, 16);
2996
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2997

    
2998
    dspfunc(put_qpel, 1, 8);
2999
    dspfunc(put_no_rnd_qpel, 1, 8);
3000

    
3001
    dspfunc(avg_qpel, 1, 8);
3002
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3003

    
3004
    dspfunc(put_h264_qpel, 0, 16);
3005
    dspfunc(put_h264_qpel, 1, 8);
3006
    dspfunc(put_h264_qpel, 2, 4);
3007
    dspfunc(avg_h264_qpel, 0, 16);
3008
    dspfunc(avg_h264_qpel, 1, 8);
3009
    dspfunc(avg_h264_qpel, 2, 4);
3010

    
3011
#undef dspfunc
3012
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3013
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3014
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3015
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3016
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3017
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3018

    
3019
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3020
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3021
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3022
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3023
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3024
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3025
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3026
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3027
        
3028
    c->hadamard8_diff[0]= hadamard8_diff16_c;
3029
    c->hadamard8_diff[1]= hadamard8_diff_c;
3030
    c->hadamard8_abs = hadamard8_abs_c;
3031
    
3032
    c->dct_sad[0]= dct_sad16x16_c;
3033
    c->dct_sad[1]= dct_sad8x8_c;
3034
    
3035
    c->sad[0]= sad16x16_c;
3036
    c->sad[1]= sad8x8_c;
3037
    
3038
    c->quant_psnr[0]= quant_psnr16x16_c;
3039
    c->quant_psnr[1]= quant_psnr8x8_c;
3040

    
3041
    c->rd[0]= rd16x16_c;
3042
    c->rd[1]= rd8x8_c;
3043

    
3044
    c->bit[0]= bit16x16_c;
3045
    c->bit[1]= bit8x8_c;
3046
        
3047
    c->add_bytes= add_bytes_c;
3048
    c->diff_bytes= diff_bytes_c;
3049
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3050
    c->bswap_buf= bswap_buf;
3051

    
3052
#ifdef HAVE_MMX
3053
    dsputil_init_mmx(c, avctx);
3054
#endif
3055
#ifdef ARCH_ARMV4L
3056
    dsputil_init_armv4l(c, avctx);
3057
#endif
3058
#ifdef HAVE_MLIB
3059
    dsputil_init_mlib(c, avctx);
3060
#endif
3061
#ifdef ARCH_ALPHA
3062
    dsputil_init_alpha(c, avctx);
3063
#endif
3064
#ifdef ARCH_POWERPC
3065
    dsputil_init_ppc(c, avctx);
3066
#endif
3067
#ifdef HAVE_MMI
3068
    dsputil_init_mmi(c, avctx);
3069
#endif
3070
#ifdef ARCH_SH4
3071
    dsputil_init_sh4(c,avctx);
3072
#endif
3073

    
3074
    switch(c->idct_permutation_type){
3075
    case FF_NO_IDCT_PERM:
3076
        for(i=0; i<64; i++)
3077
            c->idct_permutation[i]= i;
3078
        break;
3079
    case FF_LIBMPEG2_IDCT_PERM:
3080
        for(i=0; i<64; i++)
3081
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3082
        break;
3083
    case FF_SIMPLE_IDCT_PERM:
3084
        for(i=0; i<64; i++)
3085
            c->idct_permutation[i]= simple_mmx_permutation[i];
3086
        break;
3087
    case FF_TRANSPOSE_IDCT_PERM:
3088
        for(i=0; i<64; i++)
3089
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3090
        break;
3091
    default:
3092
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
3093
    }
3094
}
3095