Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 983e3246

History | View | Annotate | Download (80.7 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31

    
32

    
33
/* Clipping LUT: cropTbl[MAX_NEG_CROP + v] clips v into 0..255.  Filled at
   runtime by init code outside this chunk (used below via cm = cropTbl + MAX_NEG_CROP). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square LUT indexed through sq = squareTbl + 256, i.e. valid for -256..255.
   NOTE(review): presumably squareTbl[256 + d] == d*d — filled at init elsewhere; confirm. */
uint32_t squareTbl[512];
35

    
36
/* Standard zigzag scan order for an 8x8 coefficient block: entry i gives the
   raster index (row*8+col) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46

    
47
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): table contents are filled at runtime elsewhere; __align8 is a
   project alignment macro (from dsputil.h), required by the MMX code that reads it. */
uint16_t __align8 inv_zigzag_direct16[64];
49

    
50
/* Alternate horizontal scan order for 8x8 blocks (same index convention as
   ff_zigzag_direct); consumers live outside this chunk. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
60

    
61
/* Alternate vertical scan order for 8x8 blocks (same index convention as
   ff_zigzag_direct); consumers live outside this chunk. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
71

    
72
/* Fixed-point reciprocal table: a*inverse[b]>>32 == a/b for all
   0<=a<=65536 && 2<=b<=255.  Entry b is ceil(2^32 / b) (entry 0 unused,
   entry 1 saturated to UINT32_MAX); lets hot paths replace a division by
   a multiply + shift. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107

    
108
/* Input permutation for the simple_idct_mmx */
/* Entry i is the coefficient index (0x00..0x3F) the MMX IDCT expects at
   position i; applied when reordering quant/scan tables for that IDCT. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119

    
120
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left of the block
 * @param line_size byte stride between rows (may exceed 16)
 * @return sum of the pixels (fits easily in int: max 255*256)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;   /* advance to the next row */
    }
    return sum;
}
141

    
142
/**
 * Sum of sq[pixel] over a 16x16 block.  With sq = squareTbl + 256 this is the
 * sum of squared pixel values (squareTbl is filled elsewhere; presumably
 * sq[v] == v*v for 0 <= v < 256 — confirm against the init code).
 * @param pix       top-left of the 16x16 block
 * @param line_size byte stride between rows
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* plain byte-at-a-time reference version (disabled) */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: fetch 8 pixels in one load and peel off bytes.
               NOTE(review): type-punned, possibly misaligned load — ISO C UB,
               tolerated here as a deliberate speed hack on lenient targets. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-pixel loads per 8-pixel chunk (same caveat) */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;  /* 16 pixels consumed above; step to next row */
    }
    return s;
}
189

    
190

    
191
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
192
{
193
    int s, i;
194
    uint32_t *sq = squareTbl + 256;
195

    
196
    s = 0;
197
    for (i = 0; i < 8; i++) {
198
        s += sq[pix1[0] - pix2[0]];
199
        s += sq[pix1[1] - pix2[1]];
200
        s += sq[pix1[2] - pix2[2]];
201
        s += sq[pix1[3] - pix2[3]];
202
        s += sq[pix1[4] - pix2[4]];
203
        s += sq[pix1[5] - pix2[5]];
204
        s += sq[pix1[6] - pix2[6]];
205
        s += sq[pix1[7] - pix2[7]];
206
        pix1 += line_size;
207
        pix2 += line_size;
208
    }
209
    return s;
210
}
211

    
212
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
213
{
214
    int s, i;
215
    uint32_t *sq = squareTbl + 256;
216

    
217
    s = 0;
218
    for (i = 0; i < 16; i++) {
219
        s += sq[pix1[ 0] - pix2[ 0]];
220
        s += sq[pix1[ 1] - pix2[ 1]];
221
        s += sq[pix1[ 2] - pix2[ 2]];
222
        s += sq[pix1[ 3] - pix2[ 3]];
223
        s += sq[pix1[ 4] - pix2[ 4]];
224
        s += sq[pix1[ 5] - pix2[ 5]];
225
        s += sq[pix1[ 6] - pix2[ 6]];
226
        s += sq[pix1[ 7] - pix2[ 7]];
227
        s += sq[pix1[ 8] - pix2[ 8]];
228
        s += sq[pix1[ 9] - pix2[ 9]];
229
        s += sq[pix1[10] - pix2[10]];
230
        s += sq[pix1[11] - pix2[11]];
231
        s += sq[pix1[12] - pix2[12]];
232
        s += sq[pix1[13] - pix2[13]];
233
        s += sq[pix1[14] - pix2[14]];
234
        s += sq[pix1[15] - pix2[15]];
235

    
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
/**
 * Copy an 8x8 block of pixels into a densely packed DCT coefficient array
 * (widening each byte to a DCTELEM).
 * @param block     destination, 64 contiguous DCTELEMs (row-major)
 * @param pixels    top-left of the source block
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;   /* destination is tightly packed */
    }
}
260

    
261
/**
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks into a
 * densely packed DCT coefficient array.
 * @param block  destination, 64 contiguous DCTELEMs (row-major)
 * @param s1,s2  top-left of the two source blocks
 * @param stride byte stride between source rows (same for both)
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;   /* destination is tightly packed */
    }
}
280

    
281

    
282
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
283
                                 int line_size)
284
{
285
    int i;
286
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
287
    
288
    /* read the pixels */
289
    for(i=0;i<8;i++) {
290
        pixels[0] = cm[block[0]];
291
        pixels[1] = cm[block[1]];
292
        pixels[2] = cm[block[2]];
293
        pixels[3] = cm[block[3]];
294
        pixels[4] = cm[block[4]];
295
        pixels[5] = cm[block[5]];
296
        pixels[6] = cm[block[6]];
297
        pixels[7] = cm[block[7]];
298

    
299
        pixels += line_size;
300
        block += 8;
301
    }
302
}
303

    
304
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
305
                          int line_size)
306
{
307
    int i;
308
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
309
    
310
    /* read the pixels */
311
    for(i=0;i<8;i++) {
312
        pixels[0] = cm[pixels[0] + block[0]];
313
        pixels[1] = cm[pixels[1] + block[1]];
314
        pixels[2] = cm[pixels[2] + block[2]];
315
        pixels[3] = cm[pixels[3] + block[3]];
316
        pixels[4] = cm[pixels[4] + block[4]];
317
        pixels[5] = cm[pixels[5] + block[5]];
318
        pixels[6] = cm[pixels[6] + block[6]];
319
        pixels[7] = cm[pixels[7] + block[7]];
320
        pixels += line_size;
321
        block += 8;
322
    }
323
}
324
/*
 * 64-bit-word implementation of the put/avg half-pel pixel primitives.
 * DISABLED (#if 0): the 32-bit variant after the #else is what actually
 * compiles.  Kept for reference.  The bit tricks are the classic SWAR
 * packed-byte averages:
 *   rounding avg:    (a|b) - (((a^b) & 0xFE..FE) >> 1)
 *   truncating avg:  (a&b) + (((a^b) & 0xFE..FE) >> 1)
 * The xy2 variants average four neighbours by splitting each byte into its
 * low 2 bits (l0/l1, plus a rounding constant) and high 6 bits (h0/h1).
 */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
467

    
468
/*
 * 32-bit-word implementation of the put/avg half-pel pixel primitives
 * (the variant that is actually compiled — see the #if 0 above).
 * PIXOP2(OPNAME, OP) stamps out, for a given write operation OP (op_put or
 * op_avg below), the 8- and 16-wide copy/average functions:
 *   _pixels8_c            plain copy/average of 8-wide rows
 *   _pixels8_l2 / _l4     average of 2 / 4 source rows (with rounding;
 *                         the no_rnd variants round down instead)
 *   _x2/_y2/_xy2          half-pel interpolation right / down / diagonal,
 *                         built on the _l2 helpers or inline SWAR math
 *   CALL_2X_PIXELS(...)   16-wide wrappers calling the 8-wide core twice
 * SWAR identities used throughout (per packed byte):
 *   rounding avg:    (a|b) - (((a^b) & 0xFEFEFEFE) >> 1)
 *   truncating avg:  (a&b) + (((a^b) & 0xFEFEFEFE) >> 1)
 * and the xy2/l4 4-way average splits each byte into low 2 bits (l0/l1 with
 * rounder 0x02020202, or 0x01010101 for no_rnd) and high 6 bits (h0/h1).
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* rounding SWAR average of two packed-byte words (never overflows a byte) */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
712
/* plain store — used to stamp out the put_* family */
#define op_put(a, b) a = b

/* Instantiate the avg_* and put_* pixel-op families defined by PIXOP2 above. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar rounding averages used by the MC helpers below */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
721

    
722

    
723
/**
 * One-point GMC / bilinear interpolation of an 8-wide block with 1/16-pel
 * fractional offsets (x16, y16 in 0..16).  Each output pixel is the
 * bilinearly weighted sum of the 2x2 source neighbourhood, with the four
 * weights summing to 256 and the result rounded via `rounder` then >>8.
 * @param dst     destination block (h rows of 8 pixels)
 * @param src     source; rows 0..h and columns 0..8 are read
 * @param stride  byte stride between rows (both buffers)
 * @param h       number of rows
 * @param rounder value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);  /* top-left weight     */
    const int B = (     x16) * (16 - y16);  /* top-right weight    */
    const int C = (16 - x16) * (     y16);  /* bottom-left weight  */
    const int D = (     x16) * (     y16);  /* bottom-right weight */
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            dst[x] = (A * src[x]
                    + B * src[x + 1]
                    + C * src[stride + x]
                    + D * src[stride + x + 1]
                    + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
745

    
746
/**
 * Affine global motion compensation for one 8-pixel-wide block (MPEG-4 GMC).
 * (ox,oy) is the fixed-point source position of the first pixel; vx>>16
 * yields the position in 1/s-pel units (s = 1<<shift), a further >>shift
 * gives the integer pixel.  (dxx,dyx) advance the position per column,
 * (dxy,dyy) per row.  r is the rounding constant for the final >>(2*shift);
 * width/height bound the valid source area — samples outside are clamped
 * to the nearest edge row/column.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* convert to last valid integer coordinate for the clamp tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split fixed-point position into integer pel and s-step fraction */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: full bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest corner sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
803

    
804
/* Copy a 17-byte-wide, h-tall block: 16 bytes per row via 32-bit
   load/store macros plus one tail byte.  Used to build the padded
   source buffer for the 16-wide qpel filters below. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
818

    
819
/* Copy a 9-byte-wide, h-tall block: 8 bytes per row via 32-bit
   load/store macros plus one tail byte.  Used to build the padded
   source buffer for the 8-wide qpel filters below. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
831

    
832

    
833
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates the complete set of MPEG-4
 * quarter-pel motion compensation C functions for one output operation:
 *
 *  - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: the (20,-6,3,-1) half-pel
 *    filters.  Note the taps near the block edges fold back onto interior
 *    samples instead of reading past the block (deliberate edge handling,
 *    not a typo).
 *  - OPNAME##qpel{8,16}_mcXY_c for each quarter-pel position X,Y in 0..3,
 *    built from the lowpass passes, copy_block{9,17} padding, and the
 *    pixels*_l2/_l4 averaging helpers.
 *
 * OPNAME prefixes the generated names; RND selects the rounding variant of
 * the intermediate "put" passes; OP(dst, expr) performs the final clipped
 * store or average (cm[] is the clipping table; the rounding and >>5 live
 * inside OP).  The ff_*_old_c variants keep the older exact 4-way-average
 * scheme and remain externally visible for reference.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
1315

    
1316
/* Final-stage ops for QPEL_MC: cm[] clips, (b+16)>>5 (or +15 for the
   no-rounding variants) normalizes the 20/-6/3/-1 filter sum (gain 32). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate put, no-rounding put, and avg qpel function families. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1329

    
1330
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1331
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1332
    int i;
1333

    
1334
    for(i=0; i<h; i++){
1335
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1336
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1337
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1338
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1339
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1340
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1341
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1342
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1343
        dst+=dstStride;
1344
        src+=srcStride;        
1345
    }
1346
}
1347

    
1348
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1349
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1350
    int i;
1351

    
1352
    for(i=0; i<w; i++){
1353
        const int src_1= src[ -srcStride];
1354
        const int src0 = src[0          ];
1355
        const int src1 = src[  srcStride];
1356
        const int src2 = src[2*srcStride];
1357
        const int src3 = src[3*srcStride];
1358
        const int src4 = src[4*srcStride];
1359
        const int src5 = src[5*srcStride];
1360
        const int src6 = src[6*srcStride];
1361
        const int src7 = src[7*srcStride];
1362
        const int src8 = src[8*srcStride];
1363
        const int src9 = src[9*srcStride];
1364
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1365
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1366
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1367
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1368
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1369
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1370
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1371
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1372
        src++;
1373
        dst++;
1374
    }
1375
}
1376

    
1377
/* WMV2 mspel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
1380

    
1381
/* mspel position (1,0): average the unfiltered source with the
 * horizontally lowpass-filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[64]; /* 8x8 horizontally filtered intermediate */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
1386

    
1387
/* mspel position (2,0): horizontal lowpass filter only, written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1390

    
1391
/* mspel position (3,0): average the source shifted one pixel right with
 * the horizontally lowpass-filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[64]; /* 8x8 horizontally filtered intermediate */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
1396

    
1397
/* mspel position (0,2): vertical lowpass filter only, written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1400

    
1401
/* mspel position (1,2): average of the vertically filtered source and the
 * horizontally+vertically filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];  /* 8x11 horizontally filtered area (one row above, two below) */
    uint8_t vbuf[64];  /* vertically filtered source */
    uint8_t hvbuf[64]; /* horizontally then vertically filtered block */

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
1410
/* mspel position (3,2): like mc12, but the vertical-only path starts one
 * pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];  /* 8x11 horizontally filtered area (one row above, two below) */
    uint8_t vbuf[64];  /* vertically filtered source, shifted right by one */
    uint8_t hvbuf[64]; /* horizontally then vertically filtered block */

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
1419
/* mspel position (2,2): horizontal lowpass over an 8x11 area, then a
 * vertical lowpass over it, writing directly into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88]; /* 8x11 horizontally filtered area */

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hbuf+8, stride, 8, 8);
}
1424

    
1425

    
1426
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param line_size byte stride between successive rows of both blocks
 * @return SAD over all 256 pixel pairs
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1453

    
1454
/**
 * SAD between pix1 and the avg2() of each pix2 pixel and its right
 * neighbour (horizontal half-pel position), over a 16x16 block.
 * Note: reads one column past the 16th of pix2, as the interpolation needs it.
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1481

    
1482
/**
 * SAD between pix1 and the avg2() of each pix2 pixel and the pixel one row
 * below it (vertical half-pel position), over a 16x16 block.
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1511

    
1512
/**
 * SAD between pix1 and the avg4() of each 2x2 neighbourhood of pix2
 * (diagonal half-pel position), over a 16x16 block.
 * Note: reads one extra column and one extra row of pix2.
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1541

    
1542
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param line_size byte stride between successive rows of both blocks
 * @return SAD over all 64 pixel pairs
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1561

    
1562
/**
 * SAD between pix1 and the avg2() of each pix2 pixel and its right
 * neighbour (horizontal half-pel position), over an 8x8 block.
 * Note: reads one column past the 8th of pix2.
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1581

    
1582
/**
 * SAD between pix1 and the avg2() of each pix2 pixel and the pixel one row
 * below it (vertical half-pel position), over an 8x8 block.
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1603

    
1604
/**
 * SAD between pix1 and the avg4() of each 2x2 neighbourhood of pix2
 * (diagonal half-pel position), over an 8x8 block.
 * Note: reads one extra column and one extra row of pix2.
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int x, y;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
1625

    
1626
/* DSPContext-style wrapper (context pointer s is unused) around pix_abs16x16_c. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
1629

    
1630
/* DSPContext-style wrapper (context pointer s is unused) around pix_abs8x8_c. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
1633

    
1634
/**
1635
 * permutes an 8x8 block.
1636
 * @param block the block which will be permuted according to the given permutation vector
1637
 * @param permutation the permutation vector
1638
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1639
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
1640
 *                  (inverse) permutated to scantable order!
1641
 */
1642
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1643
{
1644
    int i;
1645
    DCTELEM temp[64];
1646
    
1647
    if(last<=0) return;
1648
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
1649

    
1650
    for(i=0; i<=last; i++){
1651
        const int j= scantable[i];
1652
        temp[j]= block[j];
1653
        block[j]=0;
1654
    }
1655
    
1656
    for(i=0; i<=last; i++){
1657
        const int j= scantable[i];
1658
        const int perm_j= permutation[j];
1659
        block[perm_j]= temp[j];
1660
    }
1661
}
1662

    
1663
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 8x8 coefficient blocks of a macroblock in one call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
1670

    
1671
/**
 * Byte-wise in-place addition: dst[i] += src[i] for i in [0, w).
 * The main loop handles eight bytes per iteration; a scalar loop
 * finishes any remainder.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    while (i + 8 <= w) {
        dst[i]     += src[i];
        dst[i + 1] += src[i + 1];
        dst[i + 2] += src[i + 2];
        dst[i + 3] += src[i + 3];
        dst[i + 4] += src[i + 4];
        dst[i + 5] += src[i + 5];
        dst[i + 6] += src[i + 6];
        dst[i + 7] += src[i + 7];
        i += 8;
    }
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
1686

    
1687
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for i in [0, w)
 * (modular uint8_t arithmetic). Eight bytes per main-loop iteration,
 * scalar tail for the remainder.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    while (i + 8 <= w) {
        dst[i]     = src1[i]     - src2[i];
        dst[i + 1] = src1[i + 1] - src2[i + 1];
        dst[i + 2] = src1[i + 2] - src2[i + 2];
        dst[i + 3] = src1[i + 3] - src2[i + 3];
        dst[i + 4] = src1[i + 4] - src2[i + 4];
        dst[i + 5] = src1[i + 5] - src2[i + 5];
        dst[i + 6] = src1[i + 6] - src2[i + 6];
        dst[i + 7] = src1[i + 7] - src2[i + 7];
        i += 8;
    }
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
1702

    
1703
/* One butterfly stage: o1 = i1+i2, o2 = i1-i2 (outputs to fresh lvalues). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded into an absolute sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
1717

    
1718
/**
 * 8x8 Hadamard-transformed difference metric (SATD-style): butterflies the
 * per-pixel differences src - dst along rows, then along columns, and sums
 * the absolute values of all 64 transformed coefficients.
 * @param s unused context pointer (DSPContext comparison-function signature)
 * @param stride byte stride of both dst and src
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* Horizontal pass: 3 butterfly stages per row on the src-dst differences. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass: first two butterfly stages per column, then the last
     * stage is folded into the absolute-value accumulation (BUTTERFLYA). */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
1767

    
1768
/**
 * 8x8 Hadamard absolute sum of src with a constant mean subtracted from
 * every pixel before the transform. Same butterfly structure as
 * hadamard8_diff_c, but against a scalar mean instead of a second block.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* Horizontal pass on mean-subtracted pixels. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass; last stage folded into BUTTERFLYA accumulation. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    return sum;
}
1811

    
1812
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1813
    MpegEncContext * const s= (MpegEncContext *)c;
1814
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1815
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
1816
    int sum=0, i;
1817

    
1818
    s->dsp.diff_pixels(temp, src1, src2, stride);
1819
    s->dsp.fdct(temp);
1820

    
1821
    for(i=0; i<64; i++)
1822
        sum+= ABS(temp[i]);
1823
        
1824
    return sum;
1825
}
1826

    
1827
void simple_idct(DCTELEM *block); //FIXME
1828

    
1829
/**
 * Quantization-error metric: DCTs the src1-src2 difference, quantizes and
 * dequantizes it, runs it through the (reference) inverse DCT and returns
 * the sum of squared differences against the unquantized coefficients.
 * Side effect: forces s->mb_intra to 0 and updates s->block_last_index[0].
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; /* second half of the buffer */
    int sum=0, i;

    s->mb_intra=0;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    /* keep an untouched copy of the coefficients for the comparison below */
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    /* sum of squared quantization errors in the transform domain */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
1851

    
1852
/**
 * Rate-distortion comparison: quantizes the src1-src2 difference block,
 * estimates the VLC bit cost of the quantized coefficients, reconstructs
 * the block and measures SSE distortion against src1. Returns
 * distortion + lambda-weighted bit cost.
 * Side effect: updates s->block_last_index[0].
 * NOTE(review): aligned_bak is a VLA of `stride` uint64_t (8*stride bytes);
 * the copy below touches rows 0..7 of `stride` bytes, so this fits, but the
 * uint32_t* casts into uint8_t buffers rely on non-strict-aliasing builds.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    /* save the prediction (src2) so idct_add can reconstruct on top of it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* accumulate run/level code lengths; the final coefficient uses the
     * "last" table, out-of-range levels cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64);
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    /* reconstruct: dequantize and add the IDCT onto the saved prediction */
    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    /* distortion plus lambda * bits (109/128 ~ fixed-point weighting) */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
1925

    
1926
/**
 * Bit-cost comparison: quantizes the src1-src2 difference block and returns
 * the estimated VLC bit count of the quantized coefficients (same run/level
 * costing as rd8x8_c, without the distortion term).
 * Side effect: updates s->block_last_index[0].
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* accumulate run/level code lengths; the final coefficient uses the
     * "last" table, out-of-range levels cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64);
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
1983

    
1984

    
1985
/* Generate 16x16 variants of the 8x8 comparison functions via the
 * WARPER88_1616 macro (defined elsewhere in the project headers);
 * presumably it applies the 8x8 function to each 8x8 quadrant — see the
 * macro definition to confirm. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
1990

    
1991
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Reference (jpeg) inverse DCT followed by a clamped store to dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
1998
/* Reference (jpeg) inverse DCT followed by a clamped add onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2003

    
2004
/**
 * Initializes a DSPContext: fills the global lookup tables once, selects
 * DCT/IDCT implementations from avctx, installs the C reference function
 * pointers, lets arch-specific code override them, then builds the IDCT
 * coefficient permutation table.
 * NOTE(review): the init_done latch is not thread-safe — concurrent first
 * calls could race on the table initialization; confirm callers serialize.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    static int init_done = 0;
    int i;

    /* one-time global table setup */
    if (!init_done) {
        /* cropTbl: identity in the middle, clamped to 0/255 at the margins */
        for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
        for(i=0;i<MAX_NEG_CROP;i++) {
            cropTbl[i] = 0;
            cropTbl[i + MAX_NEG_CROP + 256] = 255;
        }

        /* squareTbl[i] = (i-256)^2, i.e. squares of signed differences */
        for(i=0;i<512;i++) {
            squareTbl[i] = (i - 256) * (i - 256);
        }

        /* inverse zigzag (+1) table used by the MMX quantizer */
        for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;

        init_done = 1;
    }

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if(avctx->dct_algo==FF_DCT_FASTINT)
        c->fdct = fdct_ifast;
    else
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; also fixes the coefficient permutation type */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* C reference implementations; arch-specific init may override below */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs16x16     = pix_abs16x16_c;
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    c->pix_abs8x8     = pix_abs8x8_c;
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

/* fill a half-pel pixels_tab row: [0]=copy, [1]=x2, [2]=y2, [3]=xy2 */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
#undef dspfunc

/* fill a quarter-pel pixels_tab row: all 16 mcXY subpel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
#undef dspfunc

    /* WMV2 mspel motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
    
    /* block comparison functions: [0] 16x16, [1] 8x8 */
    c->hadamard8_diff[0]= hadamard8_diff16_c;
    c->hadamard8_diff[1]= hadamard8_diff_c;
    c->hadamard8_abs = hadamard8_abs_c;
    
    c->dct_sad[0]= dct_sad16x16_c;
    c->dct_sad[1]= dct_sad8x8_c;
    
    c->sad[0]= sad16x16_c;
    c->sad[1]= sad8x8_c;
    
    c->quant_psnr[0]= quant_psnr16x16_c;
    c->quant_psnr[1]= quant_psnr8x8_c;

    c->rd[0]= rd16x16_c;
    c->rd[1]= rd8x8_c;

    c->bit[0]= bit16x16_c;
    c->bit[1]= bit8x8_c;
        
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;

    /* let platform-specific code replace functions with optimized versions */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif

    /* build the coefficient permutation matching the chosen IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        fprintf(stderr, "Internal error, IDCT permutation not set\n");
    }
}
2183