Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ d2975f8d

History | View | Annotate | Download (67.4 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
#include "avcodec.h"
22
#include "dsputil.h"
23

    
24
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
25
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
26
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
27
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
28
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
29
void (*clear_blocks)(DCTELEM *blocks);
30
int (*pix_sum)(UINT8 * pix, int line_size);
31
int (*pix_norm1)(UINT8 * pix, int line_size);
32

    
33
op_pixels_abs_func pix_abs16x16;
34
op_pixels_abs_func pix_abs16x16_x2;
35
op_pixels_abs_func pix_abs16x16_y2;
36
op_pixels_abs_func pix_abs16x16_xy2;
37

    
38
op_pixels_abs_func pix_abs8x8;
39
op_pixels_abs_func pix_abs8x8_x2;
40
op_pixels_abs_func pix_abs8x8_y2;
41
op_pixels_abs_func pix_abs8x8_xy2;
42

    
43
int ff_bit_exact=0;
44

    
45
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
46
UINT32 squareTbl[512];
47

    
48
/*
 * Zigzag scan order of an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order.
 */
const UINT8 ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
58

    
59
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
60
UINT16 __align8 inv_zigzag_direct16[64];
61

    
62
/*
 * Alternate horizontal scan order of an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order.
 */
const UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
72

    
73
/*
 * Alternate vertical scan order of an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order.
 */
const UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
83

    
84
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
85
UINT32 inverse[256]={
86
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
87
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
88
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
89
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
90
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
91
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
92
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
93
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
94
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
95
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
96
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
97
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
98
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
99
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
100
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
101
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
102
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
103
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
104
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
105
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
106
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
107
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
108
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
109
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
110
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
111
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
112
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
113
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
114
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
115
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
116
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
117
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
118
};
119

    
120
/*
 * Sum of all samples of a 16x16 block.
 *
 * pix       points at the top-left sample of the block
 * line_size is the distance between the starts of two consecutive rows
 *
 * Returns the sum of the 256 samples.
 */
int pix_sum_c(UINT8 * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            total += pix[col];
        }
        pix += line_size;
    }
    return total;
}
141

    
142
/*
 * Sum over a 16x16 block of the squareTbl entry selected by each sample.
 * NOTE(review): squareTbl is filled elsewhere; presumably sq[v] == v*v, which
 * would make this the sum of squared samples -- confirm against the init code.
 *
 * pix       points at the top-left sample of the block
 * line_size is the distance between the starts of two consecutive rows
 *
 * Returns the accumulated table values for the 256 samples.
 */
int pix_norm1_c(UINT8 * pix, int line_size)
{
    UINT32 *sq = squareTbl + 256;   /* table is addressed with a +256 bias */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            total += sq[pix[col]];
        }
        pix += line_size;
    }
    return total;
}
164

    
165

    
166
/*
 * Copy an 8x8 block of pixels into a coefficient block.
 *
 * block     receives the 64 values, stored row after row (8 per row)
 * pixels    points at the top-left source pixel
 * line_size is the distance between the starts of two consecutive source rows
 */
void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            block[col] = pixels[col];
        }
        pixels += line_size;
        block  += 8;
    }
}
184

    
185
/*
 * Store the per-pixel difference s1 - s2 of two 8x8 pixel blocks.
 *
 * block  receives the 64 differences, stored row after row (8 per row)
 * s1, s2 point at the top-left pixels of the two source blocks
 * stride is the distance between the starts of two consecutive rows; the
 *        same value is used for both sources
 */
void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
                   int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            block[col] = s1[col] - s2[col];
        }
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
204

    
205

    
206
/*
 * Write an 8x8 coefficient block to a pixel buffer, mapping every
 * coefficient through cropTbl.
 * NOTE(review): cropTbl is filled elsewhere; presumably it clamps the value
 * to the 0..255 pixel range -- confirm against the init code.
 *
 * block     holds the 64 source values, row after row (8 per row)
 * pixels    points at the top-left destination pixel
 * line_size is the distance between the starts of two consecutive
 *           destination rows
 */
void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    /* biased by MAX_NEG_CROP so that negative coefficients index into cropTbl */
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int row, col;

    /* write the pixels */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            pixels[col] = cm[block[col]];
        }
        pixels += line_size;
        block  += 8;
    }
}
227

    
228
/*
 * Add an 8x8 coefficient block to the pixels already in the buffer, mapping
 * every sum through cropTbl before it is written back.
 * NOTE(review): cropTbl is filled elsewhere; presumably it clamps the value
 * to the 0..255 pixel range -- confirm against the init code.
 *
 * block     holds the 64 values to add, row after row (8 per row)
 * pixels    points at the top-left pixel; it is read and then overwritten
 * line_size is the distance between the starts of two consecutive pixel rows
 */
void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    /* biased by MAX_NEG_CROP so that negative sums index into cropTbl */
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int row, col;

    /* update the pixels in place */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            pixels[col] = cm[pixels[col] + block[col]];
        }
        pixels += line_size;
        block  += 8;
    }
}
248
#if 0
249

250
#define PIXOP2(OPNAME, OP) \
251
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
252
{\
253
    int i;\
254
    for(i=0; i<h; i++){\
255
        OP(*((uint64_t*)block), LD64(pixels));\
256
        pixels+=line_size;\
257
        block +=line_size;\
258
    }\
259
}\
260
\
261
static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
262
{\
263
    int i;\
264
    for(i=0; i<h; i++){\
265
        const uint64_t a= LD64(pixels  );\
266
        const uint64_t b= LD64(pixels+1);\
267
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
268
        pixels+=line_size;\
269
        block +=line_size;\
270
    }\
271
}\
272
\
273
static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
274
{\
275
    int i;\
276
    for(i=0; i<h; i++){\
277
        const uint64_t a= LD64(pixels  );\
278
        const uint64_t b= LD64(pixels+1);\
279
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
280
        pixels+=line_size;\
281
        block +=line_size;\
282
    }\
283
}\
284
\
285
static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
286
{\
287
    int i;\
288
    for(i=0; i<h; i++){\
289
        const uint64_t a= LD64(pixels          );\
290
        const uint64_t b= LD64(pixels+line_size);\
291
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
292
        pixels+=line_size;\
293
        block +=line_size;\
294
    }\
295
}\
296
\
297
static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
298
{\
299
    int i;\
300
    for(i=0; i<h; i++){\
301
        const uint64_t a= LD64(pixels          );\
302
        const uint64_t b= LD64(pixels+line_size);\
303
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
304
        pixels+=line_size;\
305
        block +=line_size;\
306
    }\
307
}\
308
\
309
static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
310
{\
311
        int i;\
312
        const uint64_t a= LD64(pixels  );\
313
        const uint64_t b= LD64(pixels+1);\
314
        uint64_t l0=  (a&0x0303030303030303ULL)\
315
                    + (b&0x0303030303030303ULL)\
316
                    + 0x0202020202020202ULL;\
317
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
318
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
319
        uint64_t l1,h1;\
320
\
321
        pixels+=line_size;\
322
        for(i=0; i<h; i+=2){\
323
            uint64_t a= LD64(pixels  );\
324
            uint64_t b= LD64(pixels+1);\
325
            l1=  (a&0x0303030303030303ULL)\
326
               + (b&0x0303030303030303ULL);\
327
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
328
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
329
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
330
            pixels+=line_size;\
331
            block +=line_size;\
332
            a= LD64(pixels  );\
333
            b= LD64(pixels+1);\
334
            l0=  (a&0x0303030303030303ULL)\
335
               + (b&0x0303030303030303ULL)\
336
               + 0x0202020202020202ULL;\
337
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
338
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
339
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
340
            pixels+=line_size;\
341
            block +=line_size;\
342
        }\
343
}\
344
\
345
static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
346
{\
347
        int i;\
348
        const uint64_t a= LD64(pixels  );\
349
        const uint64_t b= LD64(pixels+1);\
350
        uint64_t l0=  (a&0x0303030303030303ULL)\
351
                    + (b&0x0303030303030303ULL)\
352
                    + 0x0101010101010101ULL;\
353
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
354
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
355
        uint64_t l1,h1;\
356
\
357
        pixels+=line_size;\
358
        for(i=0; i<h; i+=2){\
359
            uint64_t a= LD64(pixels  );\
360
            uint64_t b= LD64(pixels+1);\
361
            l1=  (a&0x0303030303030303ULL)\
362
               + (b&0x0303030303030303ULL);\
363
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
364
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
365
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
366
            pixels+=line_size;\
367
            block +=line_size;\
368
            a= LD64(pixels  );\
369
            b= LD64(pixels+1);\
370
            l0=  (a&0x0303030303030303ULL)\
371
               + (b&0x0303030303030303ULL)\
372
               + 0x0101010101010101ULL;\
373
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
374
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
375
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
376
            pixels+=line_size;\
377
            block +=line_size;\
378
        }\
379
}\
380
\
381
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels    , 8)\
382
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
383
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
384
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
385
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
386
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
387
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
388
\
389
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
390
    {\
391
        OPNAME ## _pixels,\
392
        OPNAME ## _pixels_x2,\
393
        OPNAME ## _pixels_y2,\
394
        OPNAME ## _pixels_xy2},\
395
    {\
396
        OPNAME ## _pixels16,\
397
        OPNAME ## _pixels16_x2,\
398
        OPNAME ## _pixels16_y2,\
399
        OPNAME ## _pixels16_xy2}\
400
};\
401
\
402
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
403
    {\
404
        OPNAME ## _pixels,\
405
        OPNAME ## _no_rnd_pixels_x2,\
406
        OPNAME ## _no_rnd_pixels_y2,\
407
        OPNAME ## _no_rnd_pixels_xy2},\
408
    {\
409
        OPNAME ## _pixels16,\
410
        OPNAME ## _no_rnd_pixels16_x2,\
411
        OPNAME ## _no_rnd_pixels16_y2,\
412
        OPNAME ## _no_rnd_pixels16_xy2}\
413
};
414

415
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
416
#else // 32 bit variant (the 64 bit variant above is disabled)
417

    
418
/*
 * PIXOP2(OPNAME, OP): 32-bit implementation of the pixel copy/average family.
 *
 * OP(dst, val) is the store operation applied to every 32-bit group of four
 * pixels (see op_put / op_avg, defined right after this macro).  For a given
 * OPNAME the macro generates:
 *
 *   OPNAME_pixels8                 plain 8-pixel-wide store of h rows
 *   OPNAME_[no_rnd_]pixels8_l2     per-byte average of two sources;
 *                                  (a|b)-(((a^b)&0xFE..)>>1) rounds up,
 *                                  (a&b)+(((a^b)&0xFE..)>>1) rounds down
 *   OPNAME_[no_rnd_]pixels8_l4     per-byte average of four sources,
 *                                  (a+b+c+d+2)>>2, or +1 for no_rnd
 *   OPNAME_[no_rnd_]pixels8_x2     average of pixels and pixels+1
 *   OPNAME_[no_rnd_]pixels8_y2     average of pixels and pixels+line_size
 *   OPNAME_[no_rnd_]pixels8_xy2    average of the 2x2 neighbourhood; handles
 *                                  two rows per iteration, so h must be even
 *   OPNAME_..pixels16..            16-wide versions built from two 8-wide calls
 *   OPNAME_pixels_tab[2][4] and OPNAME_no_rnd_pixels_tab[2][4]
 *                                  first index: 0 = 16 wide, 1 = 8 wide
 *                                  second index: plain, x2, y2, xy2
 *
 * The source pointers of the _l2/_l4 helpers are only read and are all
 * const-qualified.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    /* two passes, one per 4-pixel-wide column of the 8-wide block */\
    for(j=0; j<2; j++){\
        int i;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    /* two passes, one per 4-pixel-wide column of the 8-wide block */\
    for(j=0; j<2; j++){\
        int i;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _pixels8_x2,\
        OPNAME ## _pixels8_y2,\
        OPNAME ## _pixels8_xy2},\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _no_rnd_pixels8_x2,\
        OPNAME ## _no_rnd_pixels8_y2,\
        OPNAME ## _no_rnd_pixels8_xy2},\
};
685

    
686
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
687
#endif
688
#define op_put(a, b) a = b
689

    
690
PIXOP2(avg, op_avg)
691
PIXOP2(put, op_put)
692
#undef op_avg
693
#undef op_put
694

    
695
#if 0
696
/* FIXME this stuff could be removed as it's not really used anymore */
697
#define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
698
                                                                                         \
699
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
700
{                                                                                        \
701
    BTYPE *p;                                                                            \
702
    const UINT8 *pix;                                                                    \
703
                                                                                         \
704
    p = block;                                                                           \
705
    pix = pixels;                                                                        \
706
    do {                                                                                 \
707
        OP(p[0], pix[0]);                                                                  \
708
        OP(p[1], pix[1]);                                                                  \
709
        OP(p[2], pix[2]);                                                                  \
710
        OP(p[3], pix[3]);                                                                  \
711
        OP(p[4], pix[4]);                                                                  \
712
        OP(p[5], pix[5]);                                                                  \
713
        OP(p[6], pix[6]);                                                                  \
714
        OP(p[7], pix[7]);                                                                  \
715
        pix += line_size;                                                                \
716
        p += INCR;                                                                       \
717
    } while (--h);;                                                                       \
718
}                                                                                        \
719
                                                                                         \
720
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
721
{                                                                                        \
722
    BTYPE *p;                                                                          \
723
    const UINT8 *pix;                                                                    \
724
                                                                                         \
725
    p = block;                                                                           \
726
    pix = pixels;                                                                        \
727
    do {                                                                   \
728
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
729
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
730
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
731
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
732
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
733
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
734
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
735
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
736
        pix += line_size;                                                                \
737
        p += INCR;                                                                       \
738
    } while (--h);                                                                        \
739
}                                                                                        \
740
                                                                                         \
741
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
742
{                                                                                        \
743
    BTYPE *p;                                                                          \
744
    const UINT8 *pix;                                                                    \
745
    const UINT8 *pix1;                                                                   \
746
                                                                                         \
747
    p = block;                                                                           \
748
    pix = pixels;                                                                        \
749
    pix1 = pixels + line_size;                                                           \
750
    do {                                                                                 \
751
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
752
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
753
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
754
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
755
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
756
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
757
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
758
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
759
        pix += line_size;                                                                \
760
        pix1 += line_size;                                                               \
761
        p += INCR;                                                                       \
762
    } while(--h);                                                                         \
763
}                                                                                        \
764
                                                                                         \
765
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
766
{                                                                                        \
767
    BTYPE *p;                                                                          \
768
    const UINT8 *pix;                                                                    \
769
    const UINT8 *pix1;                                                                   \
770
                                                                                         \
771
    p = block;                                                                           \
772
    pix = pixels;                                                                        \
773
    pix1 = pixels + line_size;                                                           \
774
    do {                                                                   \
775
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
776
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
777
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
778
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
779
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
780
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
781
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
782
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
783
        pix += line_size;                                                                \
784
        pix1 += line_size;                                                               \
785
        p += INCR;                                                                       \
786
    } while(--h);                                                                         \
787
}                                                                                        \
788
                                                                                         \
789
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
790
    OPNAME ## _pixels,                                                                   \
791
    OPNAME ## _pixels_x2,                                                                \
792
    OPNAME ## _pixels_y2,                                                                \
793
    OPNAME ## _pixels_xy2,                                                               \
794
};
795

796
/*
 * Rounding primitives.
 *
 * avg2/avg4 average two/four samples and round halves upward (+1 before >>1,
 * +2 before >>2).  Every argument is parenthesized so that an argument built
 * from a low-precedence operator (|, &, ?:, ...) still expands correctly.
 * Arguments must be free of side effects: op_avg evaluates its destination
 * twice.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)

/* Store operators passed to PIXOP as its OP argument; used as OP(dest, value). */
#define op_avg(a, b) ((a) = avg2((a), (b)))
#define op_sub(a, b) ((a) -= (b))
#define op_put(a, b) ((a) = (b))
803

804
/*
 * Instantiate the pixel operations with the rounding avg2/avg4 in effect.
 * Arguments appear to be (element type, name prefix, store operator,
 * destination row increment) -- the PIXOP header is defined earlier in the file.
 */
PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)

/*
 * not rounding primitives: same averages with the rounding bias reduced by one
 * (0 instead of 1 for avg2, 1 instead of 2 for avg4).  Arguments are
 * parenthesized so compound expressions expand correctly.
 */
#undef avg2
#undef avg4
#define avg2(a,b) (((a)+(b))>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+1)>>2)

PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */

/* drop the no-rounding variants so later code can define its own averages */
#undef avg2
#undef avg4
820
#endif
821

    
822
/*
 * Rounding averages (halves round upward) used by the code that follows,
 * e.g. pix_abs16x16_x2_c.  Arguments are parenthesized so that compound
 * expressions passed as arguments expand with the intended precedence.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
824

    
825
/**
 * Bilinear blend of an 8-pixel-wide block (C implementation).
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour, the pixel below and the pixel below-right, scaled down by 256.
 *
 * @param dst     destination, 8 pixels written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  distance between rows, applied to BOTH dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours, out of 16
 * @param y16     vertical weight of the lower neighbours, out of 16
 * @param rounder subtracted from the default bias of 128 before the >>8
 */
static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
{
    /* The four weights always add up to 16*16 = 256, hence the final >>8. */
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = x16 * (16 - y16);
    const int w_bl = (16 - x16) * y16;
    const int w_br = x16 * y16;
    const int bias = 128 - rounder;
    int row, col;

    for (row = 0; row < h; row++) {
        /* left-to-right, same evaluation order as the unrolled original */
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col] + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        bias) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
848

    
849
/**
 * Copy a block of h rows, 17 bytes per row.
 *
 * @param dst        destination of the first row
 * @param src        source of the first row
 * @param dstStride  distance between destination rows
 * @param srcStride  distance between source rows
 * @param h          number of rows to copy
 */
static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        /* bytes 0..15 go through the 32-bit load/store macros, four at a time */
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* the 17th byte is copied on its own */
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
863

    
864
/**
 * Copy a block of h rows, 9 bytes per row.
 *
 * @param dst        destination of the first row
 * @param src        source of the first row
 * @param dstStride  distance between destination rows
 * @param srcStride  distance between source rows
 * @param h          number of rows to copy
 */
static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++) {
        /* bytes 0..7 go through the 32-bit load/store macros, four at a time */
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* the 9th byte is copied on its own */
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
876

    
877
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates the C quarter-pel motion compensation
 * functions for 8x8 and 16x16 blocks, plus the table OPNAME##qpel_pixels_tab.
 *
 *   r       not referenced anywhere in the macro body
 *   OPNAME  prefix of every generated function and of the table (e.g. put_)
 *   RND     pasted between "put" and "mpeg4_..." to pick the put-lowpass
 *           variant that computes the intermediate half-pel planes
 *   OP      store operator, used as OP(dest, filter_sum)
 *
 * Lowpass filters: 8 taps (-1, 3, -6, 20, 20, -6, 3, -1) around each half-pel
 * position.  Taps that would fall outside the 9 (resp. 17) input samples are
 * replaced by mirrored in-range samples.  The raw sum is handed to OP
 * unnormalized; "cm" is declared for use by the OP macro.
 *
 * Generated functions are named mcXY where X is the horizontal and Y the
 * vertical quarter-pel position (0..3); the table index is X + 4*Y, row 0
 * holding the 16x16 functions and row 1 the 8x8 ones.
 *
 * Relies on names defined elsewhere: cropTbl, MAX_NEG_CROP, copy_block9,
 * copy_block17, qpel_mc_func and the OPNAME##pixels{8,16}[_l2|_l4] helpers.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* horizontal half-pel filter: h rows, 8 outputs per row from src[0..8] */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-pel filter: w columns, 8 outputs per column from 9 rows */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* horizontal half-pel filter: h rows, 16 outputs per row from src[0..16] */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-pel filter: w columns, 16 outputs per column from 17 rows */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 block functions ---- */\
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
}\
\
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
}\
/* ---- 16x16 block functions ---- */\
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
}\
\
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
}\
/* dispatch table: [0] = 16x16, [1] = 8x8; entry index = X + 4*Y of mcXY */\
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
  {\
    OPNAME ## qpel16_mc00_c,\
    OPNAME ## qpel16_mc10_c,\
    OPNAME ## qpel16_mc20_c,\
    OPNAME ## qpel16_mc30_c,\
    OPNAME ## qpel16_mc01_c,\
    OPNAME ## qpel16_mc11_c,\
    OPNAME ## qpel16_mc21_c,\
    OPNAME ## qpel16_mc31_c,\
    OPNAME ## qpel16_mc02_c,\
    OPNAME ## qpel16_mc12_c,\
    OPNAME ## qpel16_mc22_c,\
    OPNAME ## qpel16_mc32_c,\
    OPNAME ## qpel16_mc03_c,\
    OPNAME ## qpel16_mc13_c,\
    OPNAME ## qpel16_mc23_c,\
    OPNAME ## qpel16_mc33_c,\
  },{\
    OPNAME ## qpel8_mc00_c,\
    OPNAME ## qpel8_mc10_c,\
    OPNAME ## qpel8_mc20_c,\
    OPNAME ## qpel8_mc30_c,\
    OPNAME ## qpel8_mc01_c,\
    OPNAME ## qpel8_mc11_c,\
    OPNAME ## qpel8_mc21_c,\
    OPNAME ## qpel8_mc31_c,\
    OPNAME ## qpel8_mc02_c,\
    OPNAME ## qpel8_mc12_c,\
    OPNAME ## qpel8_mc22_c,\
    OPNAME ## qpel8_mc32_c,\
    OPNAME ## qpel8_mc03_c,\
    OPNAME ## qpel8_mc13_c,\
    OPNAME ## qpel8_mc23_c,\
    OPNAME ## qpel8_mc33_c,\
  }\
};
1281

    
1282
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1283
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1284
#define op_put(a, b) a = cm[((b) + 16)>>5]
1285
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1286

    
1287
QPEL_MC(0, put_       , _       , op_put)
1288
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1289
QPEL_MC(0, avg_       , _       , op_avg)
1290
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1291
#undef op_avg
1292
#undef op_avg_no_rnd
1293
#undef op_put
1294
#undef op_put_no_rnd
1295

    
1296
/*
 * Sum of absolute differences over a 16x16 block.
 *
 * pix1, pix2 : top-left sample of the two blocks being compared.
 * line_size  : distance, in samples, between consecutive rows; the same
 *              pitch is applied to both pointers.
 *
 * Returns the sum of |pix1[x] - pix2[x]| over 16 rows of 16 columns.
 */
int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1323

    
1324
/*
 * Sum of absolute differences over a 16x16 block, where every reference
 * sample is avg2() of a pix2 sample and its right-hand neighbour.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 17 columns are read per
 *             row (indices 0..16).
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg2() is defined elsewhere in this file.
 */
int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1351

    
1352
/*
 * Sum of absolute differences over a 16x16 block, where every reference
 * sample is avg2() of a pix2 sample and the sample directly below it.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 17 rows are read
 *             (row r is combined with row r + 1).
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg2() is defined elsewhere in this file.
 */
int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* row immediately below the current reference row */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1381

    
1382
/*
 * Sum of absolute differences over a 16x16 block, where every reference
 * sample is avg4() of a 2x2 neighbourhood of pix2: the sample itself, its
 * right-hand neighbour, and the two samples below those.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 17 rows of 17 columns are
 *             read.
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg4() is defined elsewhere in this file.
 */
int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* row immediately below the current reference row */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1411

    
1412
/*
 * Sum of absolute differences over an 8x8 block.
 *
 * pix1, pix2 : top-left sample of the two blocks being compared.
 * line_size  : distance, in samples, between consecutive rows; the same
 *              pitch is applied to both pointers.
 *
 * Returns the sum of |pix1[x] - pix2[x]| over 8 rows of 8 columns.
 */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1431

    
1432
/*
 * Sum of absolute differences over an 8x8 block, where every reference
 * sample is avg2() of a pix2 sample and its right-hand neighbour.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 9 columns are read per
 *             row (indices 0..8).
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg2() is defined elsewhere in this file.
 */
int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1451

    
1452
/*
 * Sum of absolute differences over an 8x8 block, where every reference
 * sample is avg2() of a pix2 sample and the sample directly below it.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 9 rows are read
 *             (row r is combined with row r + 1).
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg2() is defined elsewhere in this file.
 */
int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        /* row immediately below the current reference row */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1473

    
1474
/*
 * Sum of absolute differences over an 8x8 block, where every reference
 * sample is avg4() of a 2x2 neighbourhood of pix2: the sample itself, its
 * right-hand neighbour, and the two samples below those.
 *
 * pix1      : top-left sample of the block being matched.
 * pix2      : top-left sample of the reference; 9 rows of 9 columns are
 *             read.
 * line_size : distance, in samples, between consecutive rows of both
 *             blocks.
 *
 * avg4() is defined elsewhere in this file.
 */
int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        /* row immediately below the current reference row */
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1495

    
1496
/* permute block according so that it corresponds to the MMX idct
1497
   order */
1498
void block_permute(INT16 *block, UINT8 *permutation)
1499
{
1500
        int i;
1501
        INT16 temp[64];
1502

    
1503
        for(i=0; i<64; i++) temp[ permutation[i] ] = block[i];
1504

    
1505
        for(i=0; i<64; i++) block[i] = temp[i];
1506
}
1507

    
1508
/*
 * Zero a run of 6 * 64 DCTELEM coefficients starting at `blocks`
 * (presumably six 64-coefficient blocks -- confirm against callers).
 */
void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
1512

    
1513
void dsputil_init(void)
1514
{
1515
    int i;
1516

    
1517
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1518
    for(i=0;i<MAX_NEG_CROP;i++) {
1519
        cropTbl[i] = 0;
1520
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
1521
    }
1522

    
1523
    for(i=0;i<512;i++) {
1524
        squareTbl[i] = (i - 256) * (i - 256);
1525
    }
1526

    
1527
    get_pixels = get_pixels_c;
1528
    diff_pixels = diff_pixels_c;
1529
    put_pixels_clamped = put_pixels_clamped_c;
1530
    add_pixels_clamped = add_pixels_clamped_c;
1531
    gmc1= gmc1_c;
1532
    clear_blocks= clear_blocks_c;
1533
    pix_sum= pix_sum_c;
1534
    pix_norm1= pix_norm1_c;
1535

    
1536
    pix_abs16x16     = pix_abs16x16_c;
1537
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
1538
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
1539
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1540
    pix_abs8x8     = pix_abs8x8_c;
1541
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
1542
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
1543
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1544

    
1545
#ifdef HAVE_MMX
1546
    dsputil_init_mmx();
1547
#endif
1548
#ifdef ARCH_ARMV4L
1549
    dsputil_init_armv4l();
1550
#endif
1551
#ifdef HAVE_MLIB
1552
    dsputil_init_mlib();
1553
#endif
1554
#ifdef ARCH_ALPHA
1555
    dsputil_init_alpha();
1556
#endif
1557
#ifdef ARCH_POWERPC
1558
    dsputil_init_ppc();
1559
#endif
1560
#ifdef HAVE_MMI
1561
    dsputil_init_mmi();
1562
#endif
1563

    
1564
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1565
}
1566

    
1567
/* remove any non bit exact operation (testing purpose) */
1568
void avcodec_set_bit_exact(void)
1569
{
1570
    ff_bit_exact=1;
1571
#ifdef HAVE_MMX
1572
    dsputil_set_bit_exact_mmx();
1573
#endif
1574
}
1575

    
1576
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1577
              int orig_linesize[3], int coded_linesize,
1578
              AVCodecContext *avctx)
1579
{
1580
    int quad, diff, x, y;
1581
    UINT8 *orig, *coded;
1582
    UINT32 *sq = squareTbl + 256;
1583
    
1584
    quad = 0;
1585
    diff = 0;
1586
    
1587
    /* Luminance */
1588
    orig = orig_image[0];
1589
    coded = coded_image[0];
1590
    
1591
    for (y=0;y<avctx->height;y++) {
1592
        for (x=0;x<avctx->width;x++) {
1593
            diff = *(orig + x) - *(coded + x);
1594
            quad += sq[diff];
1595
        }
1596
        orig += orig_linesize[0];
1597
        coded += coded_linesize;
1598
    }
1599
   
1600
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1601
    
1602
    if (avctx->psnr_y) {
1603
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1604
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1605
    } else
1606
        avctx->psnr_y = 99.99;
1607
}
1608