/* Source: ffmpeg / libavcodec / dsputil.c @ 11f18faf (58.4 KB) */
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
#include "avcodec.h"
#include "dsputil.h"

24
int ff_bit_exact=0;
25

    
26
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
27
UINT32 squareTbl[512];
28

    
29
const UINT8 ff_zigzag_direct[64] = {
30
    0,   1,  8, 16,  9,  2,  3, 10,
31
    17, 24, 32, 25, 18, 11,  4,  5,
32
    12, 19, 26, 33, 40, 48, 41, 34,
33
    27, 20, 13,  6,  7, 14, 21, 28,
34
    35, 42, 49, 56, 57, 50, 43, 36,
35
    29, 22, 15, 23, 30, 37, 44, 51,
36
    58, 59, 52, 45, 38, 31, 39, 46,
37
    53, 60, 61, 54, 47, 55, 62, 63
38
};
39

    
40
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
41
UINT16 __align8 inv_zigzag_direct16[64];
42

    
43
const UINT8 ff_alternate_horizontal_scan[64] = {
44
    0,  1,   2,  3,  8,  9, 16, 17, 
45
    10, 11,  4,  5,  6,  7, 15, 14,
46
    13, 12, 19, 18, 24, 25, 32, 33, 
47
    26, 27, 20, 21, 22, 23, 28, 29,
48
    30, 31, 34, 35, 40, 41, 48, 49, 
49
    42, 43, 36, 37, 38, 39, 44, 45,
50
    46, 47, 50, 51, 56, 57, 58, 59, 
51
    52, 53, 54, 55, 60, 61, 62, 63,
52
};
53

    
54
const UINT8 ff_alternate_vertical_scan[64] = {
55
    0,  8,  16, 24,  1,  9,  2, 10, 
56
    17, 25, 32, 40, 48, 56, 57, 49,
57
    41, 33, 26, 18,  3, 11,  4, 12, 
58
    19, 27, 34, 42, 50, 58, 35, 43,
59
    51, 59, 20, 28,  5, 13,  6, 14, 
60
    21, 29, 36, 44, 52, 60, 37, 45,
61
    53, 61, 22, 30,  7, 15, 23, 31, 
62
    38, 46, 54, 62, 39, 47, 55, 63,
63
};
64

    
65
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
66
const UINT32 inverse[256]={
67
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
68
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
69
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
70
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
71
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
72
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
73
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
74
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
75
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
76
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
77
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
78
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
79
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
80
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
81
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
82
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
83
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
84
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
85
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
86
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
87
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
88
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
89
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
90
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
91
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
92
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
93
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
94
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
95
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
96
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
97
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
98
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
99
};
100

    
101
static int pix_sum_c(UINT8 * pix, int line_size)
102
{
103
    int s, i, j;
104

    
105
    s = 0;
106
    for (i = 0; i < 16; i++) {
107
        for (j = 0; j < 16; j += 8) {
108
            s += pix[0];
109
            s += pix[1];
110
            s += pix[2];
111
            s += pix[3];
112
            s += pix[4];
113
            s += pix[5];
114
            s += pix[6];
115
            s += pix[7];
116
            pix += 8;
117
        }
118
        pix += line_size - 16;
119
    }
120
    return s;
121
}
122

    
123
static int pix_norm1_c(UINT8 * pix, int line_size)
124
{
125
    int s, i, j;
126
    UINT32 *sq = squareTbl + 256;
127

    
128
    s = 0;
129
    for (i = 0; i < 16; i++) {
130
        for (j = 0; j < 16; j += 8) {
131
            s += sq[pix[0]];
132
            s += sq[pix[1]];
133
            s += sq[pix[2]];
134
            s += sq[pix[3]];
135
            s += sq[pix[4]];
136
            s += sq[pix[5]];
137
            s += sq[pix[6]];
138
            s += sq[pix[7]];
139
            pix += 8;
140
        }
141
        pix += line_size - 16;
142
    }
143
    return s;
144
}
145

    
146

    
147
static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
148
{
149
    int i;
150

    
151
    /* read the pixels */
152
    for(i=0;i<8;i++) {
153
        block[0] = pixels[0];
154
        block[1] = pixels[1];
155
        block[2] = pixels[2];
156
        block[3] = pixels[3];
157
        block[4] = pixels[4];
158
        block[5] = pixels[5];
159
        block[6] = pixels[6];
160
        block[7] = pixels[7];
161
        pixels += line_size;
162
        block += 8;
163
    }
164
}
165

    
166
static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
167
                          const UINT8 *s2, int stride){
168
    int i;
169

    
170
    /* read the pixels */
171
    for(i=0;i<8;i++) {
172
        block[0] = s1[0] - s2[0];
173
        block[1] = s1[1] - s2[1];
174
        block[2] = s1[2] - s2[2];
175
        block[3] = s1[3] - s2[3];
176
        block[4] = s1[4] - s2[4];
177
        block[5] = s1[5] - s2[5];
178
        block[6] = s1[6] - s2[6];
179
        block[7] = s1[7] - s2[7];
180
        s1 += stride;
181
        s2 += stride;
182
        block += 8;
183
    }
184
}
185

    
186

    
187
static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
188
                                 int line_size)
189
{
190
    int i;
191
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
192
    
193
    /* read the pixels */
194
    for(i=0;i<8;i++) {
195
        pixels[0] = cm[block[0]];
196
        pixels[1] = cm[block[1]];
197
        pixels[2] = cm[block[2]];
198
        pixels[3] = cm[block[3]];
199
        pixels[4] = cm[block[4]];
200
        pixels[5] = cm[block[5]];
201
        pixels[6] = cm[block[6]];
202
        pixels[7] = cm[block[7]];
203

    
204
        pixels += line_size;
205
        block += 8;
206
    }
207
}
208

    
209
static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
210
                          int line_size)
211
{
212
    int i;
213
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
214
    
215
    /* read the pixels */
216
    for(i=0;i<8;i++) {
217
        pixels[0] = cm[pixels[0] + block[0]];
218
        pixels[1] = cm[pixels[1] + block[1]];
219
        pixels[2] = cm[pixels[2] + block[2]];
220
        pixels[3] = cm[pixels[3] + block[3]];
221
        pixels[4] = cm[pixels[4] + block[4]];
222
        pixels[5] = cm[pixels[5] + block[5]];
223
        pixels[6] = cm[pixels[6] + block[6]];
224
        pixels[7] = cm[pixels[7] + block[7]];
225
        pixels += line_size;
226
        block += 8;
227
    }
228
}
/*
 * PIXOP2(OPNAME, OP) expands to a family of static block helpers named
 * OPNAME##_*. OP(dst, value) is the store operation applied to every
 * 32-bit group of four samples (see op_avg below and op_put).
 *
 * All helpers work on four packed 8-bit samples at a time:
 *  - (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average rounded up,
 *  - (a&b) + (((a^b)&0xFEFEFEFE)>>1) is the per-byte average rounded down
 *    (the "_no_rnd" variants),
 *  - the four-input forms split each byte into its low 2 bits (l0/l1) and
 *    high 6 bits (h0/h1) so the per-byte sums cannot carry into a neighbour;
 *    the rounding constant is 2 per byte (1 for "_no_rnd").
 *
 * Generated helpers:
 *  - _pixels8_c / _no_rnd_pixels8_c: plain 8-wide block, h rows.
 *  - _pixels8_l2 / _pixels16_l2 (+ _no_rnd): average of two sources, each
 *    with its own stride.
 *  - _pixels8_x2_c / _pixels8_y2_c (+ _no_rnd): average with the sample to
 *    the right / the sample one row below.
 *  - _pixels8_l4 / _pixels16_l4 (+ _no_rnd): average of four sources.
 *  - _pixels8_xy2_c (+ _no_rnd): average of the 2x2 neighbourhood. Two rows
 *    are produced per loop iteration, so h is assumed to be even.
 *  - 16-wide wrappers built with CALL_2X_PIXELS.
 *
 * Destination rows are accessed through uint32_t pointers, sources through
 * LD32. CALL_2X_PIXELS and LD32 are defined outside this file section.
 *
 * A disabled (#if 0) variant working on 64-bit words used to precede this
 * macro. It was removed: it defined OPNAME##_pixels but instantiated
 * OPNAME##_pixels_c, so it could not be enabled as written.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){ /* two 4-sample-wide column strips */\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 samples to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){ /* two 4-sample-wide column strips */\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 samples to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

/* Store op: per-byte average of a and b, rounded up, on four packed samples. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
617
#define op_put(a, b) a = b
618

    
619
PIXOP2(avg, op_avg)
620
PIXOP2(put, op_put)
621
#undef op_avg
622
#undef op_put
623

    
/* Rounded integer averages of two / four values. Every argument is
 * parenthesised so that expressions with low-precedence operators
 * (e.g. avg2(a|b, c)) are evaluated as written. Each argument is expanded
 * exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)

628
static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
629
{
630
    const int A=(16-x16)*(16-y16);
631
    const int B=(   x16)*(16-y16);
632
    const int C=(16-x16)*(   y16);
633
    const int D=(   x16)*(   y16);
634
    int i;
635

    
636
    for(i=0; i<h; i++)
637
    {
638
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
639
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
640
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
641
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
642
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
643
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
644
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
645
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
646
        dst+= stride;
647
        src+= stride;
648
    }
649
}
650

    
651
static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy, 
652
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
653
{
654
    int y, vx, vy;
655
    const int s= 1<<shift;
656
    
657
    width--;
658
    height--;
659

    
660
    for(y=0; y<h; y++){
661
        int x;
662

    
663
        vx= ox;
664
        vy= oy;
665
        for(x=0; x<8; x++){ //XXX FIXME optimize
666
            int src_x, src_y, frac_x, frac_y, index;
667

    
668
            src_x= vx>>16;
669
            src_y= vy>>16;
670
            frac_x= src_x&(s-1);
671
            frac_y= src_y&(s-1);
672
            src_x>>=shift;
673
            src_y>>=shift;
674
  
675
            if((unsigned)src_x < width){
676
                if((unsigned)src_y < height){
677
                    index= src_x + src_y*stride;
678
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
679
                                           + src[index       +1]*   frac_x )*(s-frac_y)
680
                                        + (  src[index+stride  ]*(s-frac_x)
681
                                           + src[index+stride+1]*   frac_x )*   frac_y
682
                                        + r)>>(shift*2);
683
                }else{
684
                    index= src_x + clip(src_y, 0, height)*stride;                    
685
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
686
                                          + src[index       +1]*   frac_x )*s
687
                                        + r)>>(shift*2);
688
                }
689
            }else{
690
                if((unsigned)src_y < height){
691
                    index= clip(src_x, 0, width) + src_y*stride;                    
692
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
693
                                           + src[index+stride  ]*   frac_y )*s
694
                                        + r)>>(shift*2);
695
                }else{
696
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
697
                    dst[y*stride + x]=    src[index         ];
698
                }
699
            }
700
            
701
            vx+= dxx;
702
            vy+= dyx;
703
        }
704
        ox += dxy;
705
        oy += dyy;
706
    }
707
}
708

    
709
static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
710
{
711
    int i;
712
    for(i=0; i<h; i++)
713
    {
714
        ST32(dst   , LD32(src   ));
715
        ST32(dst+4 , LD32(src+4 ));
716
        ST32(dst+8 , LD32(src+8 ));
717
        ST32(dst+12, LD32(src+12));
718
        dst[16]= src[16];
719
        dst+=dstStride;
720
        src+=srcStride;
721
    }
722
}
723

    
724
static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
725
{
726
    int i;
727
    for(i=0; i<h; i++)
728
    {
729
        ST32(dst   , LD32(src   ));
730
        ST32(dst+4 , LD32(src+4 ));
731
        dst[8]= src[8];
732
        dst+=dstStride;
733
        src+=srcStride;
734
    }
735
}
736

    
737
/*
 * Index mirroring for the quarter-pel lowpass filters below.
 * A filter over N outputs may only read input samples 0..n (n == N);
 * taps that would fall outside that range are reflected back inside:
 *   -1 -> 0, -2 -> 1, -3 -> 2   and   n+1 -> n, n+2 -> n-1, n+3 -> n-2
 */
static inline int qpel_mirror(int i, int n)
{
    if (i < 0)
        return -1 - i;
    if (i > n)
        return 2*n + 1 - i;
    return i;
}

/*
 * Emits the horizontal and vertical lowpass filters for one block size N
 * (8 or 16).  Each output is the 8-tap sum
 *   (s[x]+s[x+1])*20 - (s[x-1]+s[x+2])*6 + (s[x-2]+s[x+3])*3 - (s[x-3]+s[x+4])
 * with out-of-range taps mirrored by qpel_mirror(); the raw sum is handed to
 * OP, which scales, clamps (through cm) and stores it.
 * The h filter processes h rows of N outputs, reading N+1 samples per row.
 * The v filter processes w columns of N outputs, reading N+1 rows per column;
 * a column is loaded completely before any of its outputs is written.
 */
#define QPEL_LOWPASS(OPNAME, OP, N) \
static void OPNAME ## mpeg4_qpel ## N ## _h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<N; x++){\
            OP(dst[x], (src[x] + src[x+1])*20\
                     - (src[qpel_mirror(x-1, N)] + src[qpel_mirror(x+2, N)])*6\
                     + (src[qpel_mirror(x-2, N)] + src[qpel_mirror(x+3, N)])*3\
                     - (src[qpel_mirror(x-3, N)] + src[qpel_mirror(x+4, N)]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel ## N ## _v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col, y;\
    for(col=0; col<w; col++){\
        int tap[N+1];\
        for(y=0; y<=N; y++)\
            tap[y]= src[y*srcStride];\
        for(y=0; y<N; y++){\
            OP(dst[y*dstStride], (tap[y] + tap[y+1])*20\
                               - (tap[qpel_mirror(y-1, N)] + tap[qpel_mirror(y+2, N)])*6\
                               + (tap[qpel_mirror(y-2, N)] + tap[qpel_mirror(y+3, N)])*3\
                               - (tap[qpel_mirror(y-3, N)] + tap[qpel_mirror(y+4, N)]));\
        }\
        dst++;\
        src++;\
    }\
}

/*
 * Emits the sixteen OPNAME qpelN_mcXY_c functions for one block size.
 *   N    - block width/height (8 or 16)
 *   ROWS - N+1, number of source rows/columns the filters read (9 or 17);
 *          also selects copy_block9 / copy_block17
 *   FS   - row stride of the local "full" copy of the source (16 or 24)
 * mc20 is the plain horizontal lowpass and mc02 the plain vertical one; the
 * other positions combine the source and/or intermediate lowpass planes with
 * the pixelsN_l2 / pixelsN_l4 helpers (defined elsewhere).
 * Intermediate planes are always produced with the "put ## RND" filters; only
 * the final store uses OPNAME.
 */
#define QPEL_MC_FUNCS(OPNAME, RND, N, ROWS, FS) \
static void OPNAME ## qpel ## N ## _mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels ## N ## _c(dst, src, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(half, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src, half, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel ## N ## _h_lowpass(dst, src, stride, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(half, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src+1, half, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 half[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(half, full, N, FS, N);\
    OPNAME ## pixels ## N ## _l2(dst, full, half, stride, FS, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, full, stride, FS, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 half[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(half, full, N, FS, N);\
    OPNAME ## pixels ## N ## _l2(dst, full+FS, half, stride, FS, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full, halfH, halfV, halfHV, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+1, halfH, halfV, halfHV, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+FS, halfH+N, halfV, halfHV, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+FS+1, halfH+N, halfV, halfHV, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[N*ROWS];\
    UINT8 halfHV[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH, halfHV, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[N*ROWS];\
    UINT8 halfHV[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH+N, halfHV, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfV, halfHV, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[FS*ROWS];\
    UINT8 halfH[N*ROWS];\
    UINT8 halfV[N*N];\
    UINT8 halfHV[N*N];\
    copy_block ## ROWS(full, src, FS, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, ROWS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfV, halfHV, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[N*ROWS];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, ROWS);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, halfH, stride, N, N);\
}

/*
 * Instantiates the complete quarter-pel motion compensation family for one
 * store operation:
 *   r      - not referenced by the expansion
 *   OPNAME - name prefix of everything generated (e.g. put_, avg_)
 *   RND    - infix selecting which "put" lowpass filters build the
 *            intermediate planes (put ## RND ## mpeg4_qpel...)
 *   OP     - two-argument store macro OP(dst, filtered_sum); it may use the
 *            local clamp table pointer cm
 * Generated, in this order: the 8-wide h/v lowpass filters, the 16-wide h/v
 * lowpass filters, the sixteen qpel8_mcXY_c functions and the sixteen
 * qpel16_mcXY_c functions.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
QPEL_LOWPASS(OPNAME, OP, 8)\
QPEL_LOWPASS(OPNAME, OP, 16)\
QPEL_MC_FUNCS(OPNAME, RND, 8, 9, 16)\
QPEL_MC_FUNCS(OPNAME, RND, 16, 17, 24)
1104

    
1105
/*
 * Store operations handed to QPEL_MC as its OP argument.
 * b is the raw lowpass filter sum; the filter taps add up to 32, so the sum
 * is brought back to pixel scale with >>5 and clamped through the cm table
 * (cm must be a local in the function where the macro is expanded).
 *   +16 rounds to nearest, +15 is the "no rounding" variant.
 * The avg forms additionally average the result with the value already
 * stored in a (with +1 rounding in op_avg, without it in op_avg_no_rnd).
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1106
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1107
#define op_put(a, b) a = cm[((b) + 16)>>5]
1108
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1109

    
1110
/* Generate the put_, put_no_rnd_ and avg_ quarter-pel function families. */
QPEL_MC(0, put_       , _       , op_put)
1111
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1112
QPEL_MC(0, avg_       , _       , op_avg)
1113
/* The avg_no_rnd family is not generated (instantiation disabled below). */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1114
/* The store macros are only needed for the instantiations above. */
#undef op_avg
1115
#undef op_avg_no_rnd
1116
#undef op_put
1117
#undef op_put_no_rnd
1118

    
1119
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the sum over all 256 positions of |pix1 - pix2|
 */
static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1146

    
1147
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which each pixel is combined with its right-hand neighbour via avg2().
 * Reads 17 pixels per reference row (columns 0..16).
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1174

    
1175
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which each pixel is combined with the pixel one row below via avg2().
 * Reads 17 reference rows.
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size; /* reference row underneath pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1204

    
1205
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which each pixel is combined with its right, lower and lower-right
 * neighbours via avg4().
 * Reads 17 columns of 17 reference rows.
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size; /* reference row underneath pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1234

    
1235
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the sum over all 64 positions of |pix1 - pix2|
 */
static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1254

    
1255
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which each pixel is combined with its right-hand neighbour via avg2().
 * Reads 9 pixels per reference row (columns 0..8).
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1274

    
1275
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which each pixel is combined with the pixel one row below via avg2().
 * Reads 9 reference rows.
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size; /* reference row underneath pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1296

    
1297
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which each pixel is combined with its right, lower and lower-right
 * neighbours via avg4().
 * Reads 9 columns of 9 reference rows.
 * @param pix1      top-left pixel of the block to compare
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return the accumulated absolute difference
 */
static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size; /* reference row underneath pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1318

    
1319
/**
 * Reorder the coefficients of a block in place according to a permutation.
 * Only the positions listed in scantable[0..last] are touched: each such
 * coefficient at position j is moved to position permutation[j].
 * @param block       the 64-entry coefficient block, modified in place
 * @param permutation maps an original position to its new position
 * @param scantable   scan order; entries 0..last name the positions to move
 * @param last        index of the last scantable entry to process;
 *                    nothing is done when it is <= 0
 */
void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
{
    INT16 saved[64];
    int n;

    /* FIXME: permutation[1]==1 is taken to mean "identity"; that is ok but
       not clean and might fail for some permutations */
    if (last <= 0 || permutation[1] == 1)
        return;

    /* pass 1: stash every coefficient that will move and clear its old slot */
    for (n = 0; n <= last; n++) {
        const int pos = scantable[n];

        saved[pos] = block[pos];
        block[pos] = 0;
    }

    /* pass 2: drop the stashed coefficients at their permuted positions */
    for (n = 0; n <= last; n++) {
        const int pos = scantable[n];

        block[permutation[pos]] = saved[pos];
    }
}
1339

    
1340
/**
 * Zero a run of 6*64 DCTELEM coefficients starting at blocks
 * (presumably the six 8x8 blocks of one macroblock -- confirm with callers).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(DCTELEM));
}
1344

    
1345
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 * Each sum wraps modulo 256 (uint8_t arithmetic).
 * @param dst bytes to be incremented in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* Unrolled main loop: 8 bytes per iteration, so i must advance by 8.
       (Stepping by 1 would re-add src to bytes already processed.) */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining 0..7 tail bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1360

    
1361
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Each difference wraps modulo 256 (stored as uint8_t).
 * Every output byte is written exactly once, so dst may be the same buffer
 * as src1 or src2.
 * @param dst  receives the differences
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* Unrolled main loop: 8 bytes per iteration, so i must advance by 8.
       (Stepping by 1 would recompute bytes that were already written.) */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* remaining 0..7 tail bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1376

    
1377
void dsputil_init(DSPContext* c, unsigned mask)
1378
{
1379
    static int init_done = 0;
1380
    int i;
1381

    
1382
    if (!init_done) {
1383
        for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1384
        for(i=0;i<MAX_NEG_CROP;i++) {
1385
            cropTbl[i] = 0;
1386
            cropTbl[i + MAX_NEG_CROP + 256] = 255;
1387
        }
1388

    
1389
        for(i=0;i<512;i++) {
1390
            squareTbl[i] = (i - 256) * (i - 256);
1391
        }
1392

    
1393
        for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1394

    
1395
        init_done = 1;
1396
    }
1397

    
1398
    c->get_pixels = get_pixels_c;
1399
    c->diff_pixels = diff_pixels_c;
1400
    c->put_pixels_clamped = put_pixels_clamped_c;
1401
    c->add_pixels_clamped = add_pixels_clamped_c;
1402
    c->gmc1 = gmc1_c;
1403
    c->gmc = gmc_c;
1404
    c->clear_blocks = clear_blocks_c;
1405
    c->pix_sum = pix_sum_c;
1406
    c->pix_norm1 = pix_norm1_c;
1407

    
1408
    /* TODO [0] 16  [1] 8 */
1409
    c->pix_abs16x16     = pix_abs16x16_c;
1410
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
1411
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
1412
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1413
    c->pix_abs8x8     = pix_abs8x8_c;
1414
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
1415
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
1416
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1417

    
1418
#define dspfunc(PFX, IDX, NUM) \
1419
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
1420
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
1421
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
1422
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1423

    
1424
    dspfunc(put, 0, 16);
1425
    dspfunc(put_no_rnd, 0, 16);
1426
    dspfunc(put, 1, 8);
1427
    dspfunc(put_no_rnd, 1, 8);
1428

    
1429
    dspfunc(avg, 0, 16);
1430
    dspfunc(avg_no_rnd, 0, 16);
1431
    dspfunc(avg, 1, 8);
1432
    dspfunc(avg_no_rnd, 1, 8);
1433
#undef dspfunc
1434

    
1435
#define dspfunc(PFX, IDX, NUM) \
1436
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1437
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1438
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1439
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1440
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1441
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1442
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1443
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1444
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1445
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1446
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1447
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1448
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1449
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1450
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1451
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1452

    
1453
    dspfunc(put_qpel, 0, 16);
1454
    dspfunc(put_no_rnd_qpel, 0, 16);
1455

    
1456
    dspfunc(avg_qpel, 0, 16);
1457
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1458

    
1459
    dspfunc(put_qpel, 1, 8);
1460
    dspfunc(put_no_rnd_qpel, 1, 8);
1461

    
1462
    dspfunc(avg_qpel, 1, 8);
1463
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1464
#undef dspfunc
1465

    
1466
    c->add_bytes= add_bytes_c;
1467
    c->diff_bytes= diff_bytes_c;
1468

    
1469
#ifdef HAVE_MMX
1470
    dsputil_init_mmx(c, mask);
1471
    if (ff_bit_exact)
1472
    {
1473
        /* FIXME - AVCodec context should have flag for bitexact match */
1474
        /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1475
        dsputil_set_bit_exact_mmx(c, mask);
1476
    }
1477
#endif
1478
#ifdef ARCH_ARMV4L
1479
    dsputil_init_armv4l(c, mask);
1480
#endif
1481
#ifdef HAVE_MLIB
1482
    dsputil_init_mlib(c, mask);
1483
#endif
1484
#ifdef ARCH_ALPHA
1485
    dsputil_init_alpha(c, mask);
1486
#endif
1487
#ifdef ARCH_POWERPC
1488
    dsputil_init_ppc(c, mask);
1489
#endif
1490
#ifdef HAVE_MMI
1491
    dsputil_init_mmi(c, mask);
1492
#endif
1493

    
1494
}
1495

    
1496
/* remove any non bit exact operation (testing purpose) */
1497
void avcodec_set_bit_exact(void)
1498
{
1499
    ff_bit_exact=1;
1500
#ifdef HAVE_MMX
1501
// FIXME - better set_bit_exact
1502
//    dsputil_set_bit_exact_mmx();
1503
#endif
1504
}
1505

    
1506
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1507
              int orig_linesize[3], int coded_linesize,
1508
              AVCodecContext *avctx)
1509
{
1510
    int quad, diff, x, y;
1511
    UINT8 *orig, *coded;
1512
    UINT32 *sq = squareTbl + 256;
1513
    
1514
    quad = 0;
1515
    diff = 0;
1516
    
1517
    /* Luminance */
1518
    orig = orig_image[0];
1519
    coded = coded_image[0];
1520
    
1521
    for (y=0;y<avctx->height;y++) {
1522
        for (x=0;x<avctx->width;x++) {
1523
            diff = *(orig + x) - *(coded + x);
1524
            quad += sq[diff];
1525
        }
1526
        orig += orig_linesize[0];
1527
        coded += coded_linesize;
1528
    }
1529
   
1530
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1531
    
1532
    if (avctx->psnr_y) {
1533
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1534
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1535
    } else
1536
        avctx->psnr_y = 99.99;
1537
}
1538