Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 9c76bd48

History | View | Annotate | Download (59 KB)

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
21
#include "avcodec.h"
22
#include "dsputil.h"
23

    
24
int ff_bit_exact=0;
25

    
26
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
27
UINT32 squareTbl[512];
28

    
29
/* Zigzag scan of an 8x8 block: entry i is the raster position (row*8 + column)
 * of the i-th coefficient in scan order. */
const UINT8 ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
39

    
40
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
41
UINT16 __align8 inv_zigzag_direct16[64];
42

    
43
/* Alternate (horizontal) coefficient scan for an 8x8 block: entry i is the
 * raster position (row*8 + column) of the i-th coefficient in scan order. */
const UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
53

    
54
/* Alternate (vertical) coefficient scan for an 8x8 block: entry i is the
 * raster position (row*8 + column) of the i-th coefficient in scan order. */
const UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
64

    
65
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
66
const UINT32 inverse[256]={
67
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
68
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
69
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
70
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
71
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
72
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
73
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
74
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
75
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
76
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
77
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
78
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
79
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
80
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
81
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
82
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
83
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
84
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
85
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
86
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
87
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
88
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
89
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
90
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
91
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
92
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
93
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
94
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
95
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
96
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
97
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
98
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
99
};
100

    
101
/**
 * Add up every pixel of a 16x16 block.
 *
 * @param pix       first (top-left) pixel of the block
 * @param line_size offset, in UINT8 elements, from one row to the next
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(UINT8 * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
122

    
123
/**
 * Accumulate squareTbl[256 + p] over every pixel p of a 16x16 block
 * (presumably the sum of squared pixel values -- depends on how squareTbl
 * is filled in elsewhere).
 *
 * @param pix       first (top-left) pixel of the block
 * @param line_size offset, in UINT8 elements, from one row to the next
 * @return the accumulated table values
 */
static int pix_norm1_c(UINT8 * pix, int line_size)
{
    UINT32 *sq = squareTbl + 256;   /* table centred on index 0 */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += sq[pix[col]];
        pix += line_size;
    }
    return total;
}
145

    
146

    
147
/**
 * Accumulate squareTbl[256 + (a - b)] over every pair of co-located pixels
 * of two 16x16 blocks (presumably the sum of squared differences -- depends
 * on how squareTbl is filled in elsewhere).
 *
 * @param pix1      first (top-left) pixel of the first block
 * @param pix2      first (top-left) pixel of the second block
 * @param line_size offset, in UINT8 elements, from one row to the next;
 *                  shared by both blocks
 * @return the accumulated table values
 */
static int pix_norm_c(UINT8 * pix1, UINT8 * pix2, int line_size)
{
    /* Centred so that the signed difference (-255..255) is a valid index. */
    UINT32 *sq = squareTbl + 256;
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += sq[pix1[col] - pix2[col]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
171

    
172
/**
 * Copy an 8x8 block of pixels into a contiguous 64-entry coefficient block.
 *
 * @param block     destination; rows are stored back to back, 8 entries each
 * @param pixels    first (top-left) source pixel
 * @param line_size offset, in UINT8 elements, from one source row to the next
 */
static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
190

    
191
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 pixel blocks into a
 * contiguous 64-entry coefficient block.
 *
 * @param block  destination; rows are stored back to back, 8 entries each
 * @param s1     first (top-left) pixel of the minuend block
 * @param s2     first (top-left) pixel of the subtrahend block
 * @param stride offset, in UINT8 elements, from one row to the next; shared
 *               by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
                          const UINT8 *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
210

    
211

    
212
/**
 * Write an 8x8 coefficient block to pixels, mapping every value through the
 * cropTbl clamping table.
 *
 * @param block     source; 64 entries, rows stored back to back
 * @param pixels    first (top-left) destination pixel
 * @param line_size offset, in UINT8 elements, from one destination row to
 *                  the next
 *
 * NOTE(review): values are used directly as table offsets, so they are
 * assumed to lie within -MAX_NEG_CROP .. 255 + MAX_NEG_CROP.
 */
static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                                 int line_size)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;   /* centred so negative values index it */
    int row, col;

    /* write the pixels */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block += 8;
    }
}
233

    
234
/**
 * Add an 8x8 coefficient block to the pixels already in place, mapping each
 * sum through the cropTbl clamping table before storing it back.
 *
 * @param block     values to add; 64 entries, rows stored back to back
 * @param pixels    first (top-left) pixel of the block updated in place
 * @param line_size offset, in UINT8 elements, from one pixel row to the next
 *
 * NOTE(review): the sum is used directly as a table offset, so it is assumed
 * to lie within -MAX_NEG_CROP .. 255 + MAX_NEG_CROP.
 */
static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;   /* centred so negative sums index it */
    int row, col;

    /* update the pixels in place */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block += 8;
    }
}
254
#if 0
255

256
/*
 * 64-bit variant of PIXOP2: every generated function handles one row of
 * 8 pixels as a single uint64_t.
 *
 *   OPNAME  prefix of the generated function names
 *   OP      OP(dst, value): how a computed word is merged into dst
 *
 * Per byte, (a&b) + (((a^b)&0xFE..)>>1) is the mean rounded down and
 * (a|b) - (((a^b)&0xFE..)>>1) is the mean rounded up.  The xy2 functions
 * average four neighbours: the low two bits (l*) and the high six bits (h*)
 * of every byte are summed separately so no carry crosses a byte boundary;
 * the rounding constant is 2 per byte (1 for the no_rnd flavour).
 * The xy2 functions emit two rows per iteration, so h is assumed to be even.
 *
 * The 8-pixel copy/op function is named OPNAME_pixels_c because that is the
 * name the CALL_2X_PIXELS lines at the end of the macro refer to.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t c= LD64(pixels  );\
            uint64_t d= LD64(pixels+1);\
            l1=  (c&0x0303030303030303ULL)\
               + (d&0x0303030303030303ULL);\
            h1= ((c&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((d&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            c= LD64(pixels  );\
            d= LD64(pixels+1);\
            l0=  (c&0x0303030303030303ULL)\
               + (d&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((c&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((d&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t c= LD64(pixels  );\
            uint64_t d= LD64(pixels+1);\
            l1=  (c&0x0303030303030303ULL)\
               + (d&0x0303030303030303ULL);\
            h1= ((c&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((d&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            c= LD64(pixels  );\
            d= LD64(pixels+1);\
            l0=  (c&0x0303030303030303ULL)\
               + (d&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((c&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((d&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Merge b into a as the per-byte mean rounded up, eight bytes at a time.
 * Both arguments are evaluated more than once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
396
#else // 64 bit variant
397

    
398
/*
 * 32-bit variant of PIXOP2: generates the 8- and 16-pixel-wide primitives,
 * processing four pixels at a time as one uint32_t.
 *
 *   OPNAME  prefix of the generated function names
 *   OP      OP(dst, value): how a computed word is merged into dst
 *           (op_put stores it, op_avg averages it with what is there)
 *
 * Generated helpers:
 *   _pixels8_c                      plain OP of 8 pixels per row
 *   _[no_rnd_]pixels8/16_l2         per-byte mean of two sources
 *   _[no_rnd_]pixels8/16_l4         per-byte mean of four sources
 *   _[no_rnd_]pixels8_x2_c/_y2_c    l2 of a row with its right / lower neighbour
 *   _[no_rnd_]pixels8_xy2_c         mean of the 2x2 neighbourhood
 *   *_pixels16_*                    built from the 8-wide ones via CALL_2X_PIXELS
 *
 * Per byte, (a&b) + (((a^b)&0xFEFEFEFE)>>1) is the mean rounded down and
 * (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the mean rounded up.  For four inputs
 * the low two bits (l*) and the high six bits (h*) of every byte are summed
 * separately so no carry crosses a byte boundary; the rounding constant is
 * 2 per byte (1 for the no_rnd flavour).
 *
 * Destinations are written through uint32_t lvalues, so dst/block is assumed
 * to be suitably aligned.  The xy2 functions emit two rows per iteration and
 * rewind by exactly h rows afterwards, so h is assumed to be even.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            OP(*((uint32_t*)(block+k)), LD32(pixels+k));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+k]);\
            const uint32_t b= LD32(&src2[i*src_stride2+k]);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+k]);\
            const uint32_t b= LD32(&src2[i*src_stride2+k]);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+k]);\
            const uint32_t b= LD32(&src2[i*src_stride2+k]);\
            const uint32_t c= LD32(&src3[i*src_stride3+k]);\
            const uint32_t d= LD32(&src4[i*src_stride4+k]);\
            const uint32_t l0=  (a&0x03030303UL)\
                              + (b&0x03030303UL)\
                              + 0x02020202UL;\
            const uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                             + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t l1=  (c&0x03030303UL)\
                              + (d&0x03030303UL);\
            const uint32_t h1= ((c&0xFCFCFCFCUL)>>2)\
                             + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+k]);\
            const uint32_t b= LD32(&src2[i*src_stride2+k]);\
            const uint32_t c= LD32(&src3[i*src_stride3+k]);\
            const uint32_t d= LD32(&src4[i*src_stride4+k]);\
            const uint32_t l0=  (a&0x03030303UL)\
                              + (b&0x03030303UL)\
                              + 0x01010101UL;\
            const uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                             + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t l1=  (c&0x03030303UL)\
                              + (d&0x03030303UL);\
            const uint32_t h1= ((c&0xFCFCFCFCUL)>>2)\
                             + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){ /* left, then right 4-pixel column */\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t c= LD32(pixels  );\
            uint32_t d= LD32(pixels+1);\
            l1=  (c&0x03030303UL)\
               + (d&0x03030303UL);\
            h1= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            c= LD32(pixels  );\
            d= LD32(pixels+1);\
            l0=  (c&0x03030303UL)\
               + (d&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1); /* back to the top row, 4 pixels right */\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){ /* left, then right 4-pixel column */\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t c= LD32(pixels  );\
            uint32_t d= LD32(pixels+1);\
            l1=  (c&0x03030303UL)\
               + (d&0x03030303UL);\
            h1= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            c= LD32(pixels  );\
            d= LD32(pixels+1);\
            l0=  (c&0x03030303UL)\
               + (d&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1); /* back to the top row, 4 pixels right */\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

/* Merge b into a as the per-byte mean rounded up, four bytes at a time.
 * Both arguments are evaluated more than once. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
641
#endif
642
#define op_put(a, b) a = b
643

    
644
PIXOP2(avg, op_avg)
645
PIXOP2(put, op_put)
646
#undef op_avg
647
#undef op_put
648

    
649
/* Integer mean of two / four values, rounded to nearest with halves rounded
 * up.  Every argument is parenthesised so that expressions containing
 * operators of lower precedence than '+' (e.g. x|y, c ? a : b) are averaged
 * as a whole.  Each argument is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
651

    
652

    
653
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fixed sub-pixel
 * offset given in 1/16 pixel units.
 *
 * Each output pixel is the weighted sum of its 2x2 source neighbourhood;
 * the four weights add up to 256, hence the final shift by 8.
 *
 * @param dst     first (top-left) destination pixel
 * @param src     first (top-left) source pixel; 9 columns and h+1 rows are read
 * @param stride  offset, in UINT8 elements, from one row to the next; shared
 *                by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours (0..16 expected)
 * @param y16     vertical weight of the lower neighbours (0..16 expected)
 * @param rounder constant added to every weighted sum before the shift
 */
static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);   /* weight of the top-left sample     */
    const int B=(   x16)*(16-y16);   /* weight of the top-right sample    */
    const int C=(16-x16)*(   y16);   /* weight of the bottom-left sample  */
    const int D=(   x16)*(   y16);   /* weight of the bottom-right sample */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A*src[col] + B*src[col+1]
                        + C*src[stride+col] + D*src[stride+col+1] + rounder)>>8;
        }
        dst += stride;
        src += stride;
    }
}
675

    
676
/**
 * Motion compensation of an 8-pixel-wide block along an affine sampling
 * grid, with bilinear interpolation and clamping at the picture border.
 *
 * Sampling positions are fixed-point numbers: the low 16 bits are dropped,
 * the next `shift` bits are the sub-pixel fraction and the rest is the
 * integer source coordinate.
 *
 * @param dst        first (top-left) destination pixel
 * @param src        source picture; sample (x, y) is src[x + y*stride]
 * @param stride     offset, in UINT8 elements, from one row to the next;
 *                   shared by dst and src
 * @param h          number of rows to produce
 * @param ox,oy      sampling position of the first output pixel
 * @param dxx,dyx    position increment per output column (x and y parts)
 * @param dxy,dyy    position increment per output row (x and y parts)
 * @param shift      number of sub-pixel fraction bits
 * @param r          rounding constant added before the final shift
 * @param width      source width; x coordinates are limited to 0..width-1
 * @param height     source height; y coordinates are limited to 0..height-1
 */
static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    const int frac_mask = s - 1;
    const int last_x = width - 1;
    const int last_y = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        UINT8 *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* The unsigned compare rejects negative coordinates as well as
             * those at or beyond the last column/row, whose right/lower
             * neighbour would fall outside the picture. */
            x_inside = (unsigned)src_x < (unsigned)last_x;
            y_inside = (unsigned)src_y < (unsigned)last_y;

            if (x_inside && y_inside) {
                /* fully inside: bilinear blend of the 2x2 neighbourhood */
                int top, bottom;

                index  = src_x + src_y*stride;
                top    = src[index         ]*(s-frac_x) + src[index       +1]*frac_x;
                bottom = src[index+stride  ]*(s-frac_x) + src[index+stride+1]*frac_x;
                out[col] = (top*(s-frac_y) + bottom*frac_y + r)>>(shift*2);
            } else if (x_inside) {
                /* row clamped: blend horizontally only */
                index = src_x + clip(src_y, 0, last_y)*stride;
                out[col] = ((  src[index  ]*(s-frac_x)
                             + src[index+1]*   frac_x )*s
                            + r)>>(shift*2);
            } else if (y_inside) {
                /* column clamped: blend vertically only */
                index = clip(src_x, 0, last_x) + src_y*stride;
                out[col] = ((  src[index       ]*(s-frac_y)
                             + src[index+stride]*   frac_y )*s
                            + r)>>(shift*2);
            } else {
                /* both clamped: copy the nearest border pixel */
                index = clip(src_x, 0, last_x) + clip(src_y, 0, last_y)*stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
733

    
734
/**
 * Copy a block that is 17 pixels wide and h rows high.
 * Each row is moved as four 32-bit words (via LD32/ST32) plus one
 * trailing byte.
 *
 * @param dst       first (top-left) destination pixel
 * @param src       first (top-left) source pixel
 * @param dstStride offset, in UINT8 elements, between destination rows
 * @param srcStride offset, in UINT8 elements, between source rows
 * @param h         number of rows to copy
 */
static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
748

    
749
/**
 * Copy a block of h rows, 9 bytes per row, from src to dst.
 * Each row is moved as two words through the ST32/LD32 helpers (defined
 * elsewhere; presumably 32-bit store/load -- confirm) plus one extra byte.
 *
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance in bytes between destination rows
 * @param srcStride distance in bytes between source rows
 * @param h         number of rows to copy
 */
static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int rows_left;

    for (rows_left = h; rows_left > 0; rows_left--) {
        ST32(dst + 0, LD32(src + 0));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];   /* ninth byte of the row */

        dst += dstStride;
        src += srcStride;
    }
}
761

    
762
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates the C quarter-pel motion
 * compensation functions for one store operator.
 *
 *   r      - not referenced anywhere in the expansion
 *   OPNAME - prefix of every generated function (e.g. put_, avg_)
 *   RND    - infix choosing which put...mpeg4_qpel*_lowpass variant is used
 *            to build intermediate half-sample planes
 *   OP     - store operator invoked as OP(destination, filtered_sum); the
 *            operators used with this macro index the local table `cm`
 *
 * Generated per block size N (8 and 16):
 *   OPNAME mpeg4_qpelN_h_lowpass - horizontal filter, one row per pass,
 *                                  reads src[0..N] of each row
 *   OPNAME mpeg4_qpelN_v_lowpass - vertical filter, one column per pass,
 *                                  reads rows 0..N of each column
 *   OPNAME qpelN_mcXY_c          - X selects the horizontal and Y the
 *                                  vertical variant (0 = none, 2 = lowpass
 *                                  only, 1/3 = lowpass blended with source)
 *
 * The filter uses the taps 20, -6, 3, -1 on sample pairs around each output
 * position; near the block ends the outer taps reuse in-range samples, so
 * nothing outside src[0..N] is read.
 *
 * Only block comments may be used inside the macro body: line splicing
 * happens before comment removal, so a line comment would swallow the rest.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    /* h counts the rows still to be filtered */\
    for(; h > 0; h--)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    /* w counts the columns still to be filtered */\
    for(; w > 0; w--)\
    {\
        /* the nine source rows of this column are loaded before any store */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    /* h counts the rows still to be filtered */\
    for(; h > 0; h--)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    /* w counts the columns still to be filtered */\
    for(; w > 0; w--)\
    {\
        /* the seventeen source rows of this column are loaded before any store */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 entry points ---- */\
\
/* plain copy/average of the 8x8 source block */\
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
/* horizontal lowpass blended with the source block */\
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
/* horizontal lowpass written straight to dst */\
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
/* horizontal lowpass blended with the source shifted one pixel right */\
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
/* vertical lowpass blended with the source block (via a 9-row local copy) */\
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
/* vertical lowpass written straight to dst */\
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
}\
\
/* vertical lowpass blended with the source shifted one row down */\
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
\
/* four-way blend: source, H plane, V plane, HV plane */\
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
/* as mc11, with source and V plane taken one pixel to the right */\
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
/* as mc11, with source and H plane taken one row down */\
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
/* as mc11, with source one row down and one pixel right, H plane one row down */\
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
/* blend of H plane and HV plane */\
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
\
/* blend of H plane (one row down) and HV plane */\
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
\
/* blend of V plane and HV plane */\
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
/* blend of V plane (one pixel right) and HV plane */\
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
/* horizontal lowpass followed by vertical lowpass into dst */\
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
}\
\
/* ---- 16x16 entry points (same scheme, 17-row/24-stride local copy) ---- */\
\
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
}\
\
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
}
1129

    
1130
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1131
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1132
#define op_put(a, b) a = cm[((b) + 16)>>5]
1133
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1134

    
1135
QPEL_MC(0, put_       , _       , op_put)
1136
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1137
QPEL_MC(0, avg_       , _       , op_avg)
1138
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1139
#undef op_avg
1140
#undef op_avg_no_rnd
1141
#undef op_put
1142
#undef op_put_no_rnd
1143

    
1144
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 *
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return sum of |pix1 - pix2| over all 256 positions
 */
static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1171

    
1172
/**
 * Sum of absolute differences between a 16x16 block and a reference that is
 * the avg2() of each pix2 pixel with its right-hand neighbour.
 * Reads 17 pixels per row of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 256 positions
 */
static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1199

    
1200
/**
 * Sum of absolute differences between a 16x16 block and a reference that is
 * the avg2() of each pix2 pixel with the pixel directly below it.
 * Reads 17 rows of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 256 positions
 */
static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* row of the reference immediately under the current one */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1229

    
1230
/**
 * Sum of absolute differences between a 16x16 block and a reference that is
 * the avg4() of each 2x2 neighbourhood of pix2 (pixel, right, below,
 * below-right).  Reads 17 pixels from each of 17 rows of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 256 positions
 */
static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* row of the reference immediately under the current one */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1259

    
1260
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 *
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return sum of |pix1 - pix2| over all 64 positions
 */
static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1279

    
1280
/**
 * Sum of absolute differences between an 8x8 block and a reference that is
 * the avg2() of each pix2 pixel with its right-hand neighbour.
 * Reads 9 pixels per row of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 64 positions
 */
static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1299

    
1300
/**
 * Sum of absolute differences between an 8x8 block and a reference that is
 * the avg2() of each pix2 pixel with the pixel directly below it.
 * Reads 9 rows of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 64 positions
 */
static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        /* row of the reference immediately under the current one */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1321

    
1322
/**
 * Sum of absolute differences between an 8x8 block and a reference that is
 * the avg4() of each 2x2 neighbourhood of pix2 (pixel, right, below,
 * below-right).  Reads 9 pixels from each of 9 rows of pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @return accumulated absolute difference over all 64 positions
 */
static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        /* row of the reference immediately under the current one */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1343

    
1344
/**
 * Move the coefficients of a block to their permuted positions, in place.
 * Only the coefficients reached by scantable[0..last] are touched; all other
 * entries of block are left as they are.
 *
 * @param block       coefficient block, modified in place
 * @param permutation maps an original index to its destination index
 * @param scantable   scan order; entries 0..last select the indices to move
 * @param last        index into scantable of the last coefficient to move;
 *                    nothing is done when last <= 0
 */
void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
{
    INT16 saved[64];
    int pos;

    if (last <= 0)
        return;
    /* disabled shortcut kept from the original: */
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms

    /* Pass 1: stash every selected coefficient and clear its old slot, so
       that no source is overwritten before it has been read. */
    for (pos = 0; pos <= last; pos++) {
        const int idx = scantable[pos];

        saved[idx] = block[idx];
        block[idx] = 0;
    }

    /* Pass 2: drop each stashed coefficient into its permuted slot. */
    for (pos = 0; pos <= last; pos++) {
        const int idx = scantable[pos];

        block[permutation[idx]] = saved[idx];
    }
}
1364

    
1365
/**
 * Zero six consecutive blocks of 64 coefficients each.
 *
 * @param blocks start of the 6*64 DCTELEM array to clear
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
1369

    
1370
/**
 * Add src to dst element-wise, in place: dst[i] += src[i] for 0 <= i < w.
 * Each sum is stored back into a uint8_t, so it wraps modulo 256.
 *
 * @param dst accumulator array, at least w bytes, updated in place
 * @param src addend array, at least w bytes
 * @param w   number of bytes to process; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* The body handles 8 bytes per pass, so the index must advance by 8.
       Advancing by 1 makes the windows overlap and adds src to most bytes
       more than once. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining w % 8 bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1385

    
1386
/**
 * Element-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Each difference is stored into a uint8_t, so it wraps modulo 256.
 * Every element is read and written exactly once, so dst may be the same
 * array as src1 or src2.
 *
 * @param dst  output array, at least w bytes
 * @param src1 minuend array, at least w bytes
 * @param src2 subtrahend array, at least w bytes
 * @param w    number of bytes to process; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* The body handles 8 bytes per pass, so the index must advance by 8.
       Advancing by 1 recomputes overlapping windows, which is wasted work
       and gives wrong results when dst aliases an input. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* remaining w % 8 bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1401

    
1402
void dsputil_init(DSPContext* c, unsigned mask)
1403
{
1404
    static int init_done = 0;
1405
    int i;
1406

    
1407
    if (!init_done) {
1408
        for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1409
        for(i=0;i<MAX_NEG_CROP;i++) {
1410
            cropTbl[i] = 0;
1411
            cropTbl[i + MAX_NEG_CROP + 256] = 255;
1412
        }
1413

    
1414
        for(i=0;i<512;i++) {
1415
            squareTbl[i] = (i - 256) * (i - 256);
1416
        }
1417

    
1418
        for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1419

    
1420
        init_done = 1;
1421
    }
1422

    
1423
    c->get_pixels = get_pixels_c;
1424
    c->diff_pixels = diff_pixels_c;
1425
    c->put_pixels_clamped = put_pixels_clamped_c;
1426
    c->add_pixels_clamped = add_pixels_clamped_c;
1427
    c->gmc1 = gmc1_c;
1428
    c->gmc = gmc_c;
1429
    c->clear_blocks = clear_blocks_c;
1430
    c->pix_sum = pix_sum_c;
1431
    c->pix_norm1 = pix_norm1_c;
1432
    c->pix_norm = pix_norm_c;
1433

    
1434
    /* TODO [0] 16  [1] 8 */
1435
    c->pix_abs16x16     = pix_abs16x16_c;
1436
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
1437
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
1438
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1439
    c->pix_abs8x8     = pix_abs8x8_c;
1440
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
1441
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
1442
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1443

    
1444
#define dspfunc(PFX, IDX, NUM) \
1445
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
1446
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
1447
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
1448
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1449

    
1450
    dspfunc(put, 0, 16);
1451
    dspfunc(put_no_rnd, 0, 16);
1452
    dspfunc(put, 1, 8);
1453
    dspfunc(put_no_rnd, 1, 8);
1454

    
1455
    dspfunc(avg, 0, 16);
1456
    dspfunc(avg_no_rnd, 0, 16);
1457
    dspfunc(avg, 1, 8);
1458
    dspfunc(avg_no_rnd, 1, 8);
1459
#undef dspfunc
1460

    
1461
#define dspfunc(PFX, IDX, NUM) \
1462
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1463
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1464
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1465
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1466
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1467
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1468
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1469
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1470
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1471
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1472
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1473
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1474
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1475
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1476
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1477
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1478

    
1479
    dspfunc(put_qpel, 0, 16);
1480
    dspfunc(put_no_rnd_qpel, 0, 16);
1481

    
1482
    dspfunc(avg_qpel, 0, 16);
1483
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1484

    
1485
    dspfunc(put_qpel, 1, 8);
1486
    dspfunc(put_no_rnd_qpel, 1, 8);
1487

    
1488
    dspfunc(avg_qpel, 1, 8);
1489
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1490
#undef dspfunc
1491

    
1492
    c->add_bytes= add_bytes_c;
1493
    c->diff_bytes= diff_bytes_c;
1494

    
1495
#ifdef HAVE_MMX
1496
    dsputil_init_mmx(c, mask);
1497
    if (ff_bit_exact)
1498
    {
1499
        /* FIXME - AVCodec context should have flag for bitexact match */
1500
        /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1501
        dsputil_set_bit_exact_mmx(c, mask);
1502
    }
1503
#endif
1504
#ifdef ARCH_ARMV4L
1505
    dsputil_init_armv4l(c, mask);
1506
#endif
1507
#ifdef HAVE_MLIB
1508
    dsputil_init_mlib(c, mask);
1509
#endif
1510
#ifdef ARCH_ALPHA
1511
    dsputil_init_alpha(c, mask);
1512
#endif
1513
#ifdef ARCH_POWERPC
1514
    dsputil_init_ppc(c, mask);
1515
#endif
1516
#ifdef HAVE_MMI
1517
    dsputil_init_mmi(c, mask);
1518
#endif
1519

    
1520
}
1521

    
1522
/* remove any non bit exact operation (testing purpose) */
1523
void avcodec_set_bit_exact(void)
1524
{
1525
    ff_bit_exact=1;
1526
#ifdef HAVE_MMX
1527
// FIXME - better set_bit_exact
1528
//    dsputil_set_bit_exact_mmx();
1529
#endif
1530
}
1531

    
1532
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1533
              int orig_linesize[3], int coded_linesize,
1534
              AVCodecContext *avctx)
1535
{
1536
    int quad, diff, x, y;
1537
    UINT8 *orig, *coded;
1538
    UINT32 *sq = squareTbl + 256;
1539
    
1540
    quad = 0;
1541
    diff = 0;
1542
    
1543
    /* Luminance */
1544
    orig = orig_image[0];
1545
    coded = coded_image[0];
1546
    
1547
    for (y=0;y<avctx->height;y++) {
1548
        for (x=0;x<avctx->width;x++) {
1549
            diff = *(orig + x) - *(coded + x);
1550
            quad += sq[diff];
1551
        }
1552
        orig += orig_linesize[0];
1553
        coded += coded_linesize;
1554
    }
1555
   
1556
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1557
    
1558
    if (avctx->psnr_y) {
1559
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1560
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1561
    } else
1562
        avctx->psnr_y = 99.99;
1563
}
1564