Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 9f9c3229

History | View | Annotate | Download (69.9 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
#include "avcodec.h"
22
#include "dsputil.h"
23

    
24
/*
 * Function-pointer hooks for the block-level pixel primitives.
 *
 * The first four have the same parameter lists as get_pixels_c(),
 * diff_pixels_c(), put_pixels_clamped_c() and add_pixels_clamped_c() in this
 * file; pix_sum and pix_norm1 match pix_sum_c() and pix_norm1_c().
 * NOTE(review): the code that assigns these pointers is not visible in this
 * part of the file -- presumably an init routine picks the C version or a
 * CPU-specific one; confirm before relying on them being non-NULL.
 */
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);

/* Global motion compensation hooks (implementations not visible here). */
void (*ff_gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h,
                int x16, int y16, int rounder);
void (*ff_gmc )(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy, int shift, int r,
                int width, int height);

void (*clear_blocks)(DCTELEM *blocks);
int (*pix_sum)(UINT8 * pix, int line_size);
int (*pix_norm1)(UINT8 * pix, int line_size);
34

    
35
/*
 * Hooks of type op_pixels_abs_func (declared outside this file), one set for
 * 16x16 blocks and one for 8x8 blocks.
 * NOTE(review): the _x2 / _y2 / _xy2 suffixes presumably select the
 * half-pel interpolated reference, mirroring the pixel ops further down --
 * confirm against the implementations.
 */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;
44

    
45
/* Global flag, cleared by default.
 * NOTE(review): readers of this flag are outside this view; presumably it
 * requests bit-exact (platform-independent) code paths -- confirm. */
int ff_bit_exact = 0;
46

    
47
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
48
UINT32 squareTbl[512];
49

    
50
/* Zigzag scan of an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const UINT8 ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
60

    
61
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
62
UINT16 __align8 inv_zigzag_direct16[64];
63

    
64
/* Alternate scan favouring horizontal runs: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74

    
75
/* Alternate scan favouring vertical runs: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
87
UINT32 inverse[256]={
88
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
89
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
90
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
91
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
92
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
93
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
94
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
95
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
96
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
97
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
98
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
99
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
100
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
101
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
102
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
103
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
104
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
105
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
106
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
107
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
108
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
109
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
110
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
111
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
112
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
113
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
114
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
115
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
116
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
117
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
118
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
119
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
120
};
121

    
122
/**
 * Sum of all pixel values in a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return the sum of the 256 pixel values
 */
int pix_sum_c(UINT8 * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
143

    
144
/**
 * Sum, over a 16x16 block, of the squareTbl entry selected by each pixel.
 *
 * Each pixel value v contributes squareTbl[256 + v].
 * NOTE(review): squareTbl is initialised outside this view; presumably the
 * entry is v * v, which would make the result the sum of squared pixels.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return the accumulated table values
 */
int pix_norm1_c(UINT8 * pix, int line_size)
{
    UINT32 *squares = squareTbl + 256;
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += squares[pix[col]];
        pix += line_size;
    }
    return total;
}
166

    
167

    
168
/**
 * Copy an 8x8 pixel block into a DCT coefficient block.
 *
 * @param block     destination, 64 elements stored row after row (stride 8)
 * @param pixels    top-left source pixel
 * @param line_size distance in bytes between the starts of consecutive
 *                  source rows
 */
void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
186

    
187
/**
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks.
 *
 * @param block  destination, 64 elements stored row after row (stride 8)
 * @param s1     top-left pixel of the minuend block
 * @param s2     top-left pixel of the subtrahend block
 * @param stride distance in bytes between consecutive rows, shared by s1
 *               and s2
 */
void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
                   int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
206

    
207

    
208
/**
 * Write an 8x8 coefficient block to pixels, mapping every value through the
 * crop table.
 *
 * @param block     source, 64 elements stored row after row (stride 8)
 * @param pixels    top-left destination pixel
 * @param line_size distance in bytes between the starts of consecutive
 *                  destination rows
 */
void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    /* Offset base so that values below zero index into the lower padding.
     * NOTE(review): values are assumed to stay within MAX_NEG_CROP of the
     * 0..255 range; nothing here checks that. */
    UINT8 *clamp = cropTbl + MAX_NEG_CROP;
    int row, col;

    /* write the pixels */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = clamp[block[col]];
        pixels += line_size;
        block += 8;
    }
}
229

    
230
/**
 * Add an 8x8 coefficient block to the existing pixels, mapping every sum
 * through the crop table.
 *
 * @param block     values to add, 64 elements stored row after row (stride 8)
 * @param pixels    top-left pixel of the block that is updated in place
 * @param line_size distance in bytes between the starts of consecutive
 *                  pixel rows
 */
void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    /* Offset base so that sums below zero index into the lower padding.
     * NOTE(review): sums are assumed to stay within MAX_NEG_CROP of the
     * 0..255 range; nothing here checks that. */
    UINT8 *clamp = cropTbl + MAX_NEG_CROP;
    int row, col;

    /* update the pixels in place */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = clamp[pixels[col] + block[col]];
        pixels += line_size;
        block += 8;
    }
}
250
#if 0
251

252
/*
 * 64-bit-word variant of PIXOP2 (compiled out by the enclosing #if 0).
 *
 * PIXOP2(OPNAME, OP) defines a family of OPNAME_* functions that process an
 * 8-pixel-wide block one uint64_t per row; OP(dst, value) decides how the
 * computed value is combined with the destination.
 *
 * Packed-byte arithmetic used below (8 pixels per word):
 *   (a&b) + (((a^b)&0xFE..FE)>>1)   per-byte (a + b) >> 1      (no rounding)
 *   (a|b) - (((a^b)&0xFE..FE)>>1)   per-byte (a + b + 1) >> 1  (rounding)
 *   xy2: each byte is split into its low 2 bits (l*) and high 6 bits (h*) so
 *        that four pixels plus the rounding constant can be summed without
 *        carrying into the neighbouring byte.
 *
 * NOTE(review): the tables here put the 8-wide functions at index 0 and the
 * 16-wide ones at index 1, the opposite of the 32-bit variant in the #else
 * branch, and this variant has no _l2/_l4 helpers -- check both before
 * re-enabling it.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
/* Two output rows per iteration; l0/h0 carry the row shared with the next one. */\
static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a0= LD64(pixels  );\
    const uint64_t b0= LD64(pixels+1);\
    uint64_t l0=  (a0&0x0303030303030303ULL)\
                + (b0&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a0&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b0&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
/* Same as _pixels_xy2 but with rounding constant 1 instead of 2 per byte. */\
static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a0= LD64(pixels  );\
    const uint64_t b0= LD64(pixels+1);\
    uint64_t l0=  (a0&0x0303030303030303ULL)\
                + (b0&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a0&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b0&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _pixels_x2,\
        OPNAME ## _pixels_y2,\
        OPNAME ## _pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2}\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _no_rnd_pixels_x2,\
        OPNAME ## _no_rnd_pixels_y2,\
        OPNAME ## _no_rnd_pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2}\
};

/* a = per-byte rounded-up average of a and b, eight bytes at a time. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
418
#else // end of the (disabled) 64 bit variant; the 32 bit variant follows
419

    
420
/*
 * PIXOP2(OPNAME, OP): 32-bit-word implementation of the pixel copy and
 * interpolation primitives.
 *
 * Expands to a family of OPNAME_* functions plus the two dispatch tables
 * OPNAME_pixels_tab and OPNAME_no_rnd_pixels_tab.  OP(dst, value) decides how
 * a computed 32-bit word (4 pixels) is combined with the destination word.
 * LD32 and CALL_2X_PIXELS come from outside this file.
 *
 * Function name suffixes:
 *   (none)  full-pel source
 *   _x2     average of pixels[] and pixels[]+1          (horizontal half-pel)
 *   _y2     average of pixels[] and pixels[]+line_size  (vertical half-pel)
 *   _xy2    average of the four neighbours
 *   _l2/_l4 average of 2 / 4 independent sources, each with its own stride
 *   _no_rnd the same with the smaller rounding constant
 *
 * Packed-byte arithmetic (4 pixels per word):
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)   per-byte (a + b) >> 1
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)   per-byte (a + b + 1) >> 1
 *   4-way: each byte is split into its low 2 bits (l0/l1) and high 6 bits
 *          (h0/h1) so four pixels plus the rounding constant (2, or 1 for
 *          _no_rnd) can be summed without carrying into the next byte; the
 *          0x0F0F0F0F mask drops the bits shifted in from the byte above.
 *
 * Table layout: [0][..] are the 16-pixel-wide functions, [1][..] the
 * 8-pixel-wide ones; the second index is 0 full-pel, 1 x2, 2 y2, 3 xy2.
 *
 * The xy2 functions consume two rows per iteration and are written for even h.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* Nothing is interpolated at full-pel positions, so rounding does not matter. */\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8(block, pixels, line_size, h);\
}\
\
/* Truncating two-source average, 8 pixels wide (low word, then high word). */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i, x;\
    for(i=0; i<h; i++){\
        for(x=0; x<8; x+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+x]);\
            const uint32_t b= LD32(&src2[i*src_stride2+x]);\
            OP(*((uint32_t*)&dst[i*dst_stride+x]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
    }\
}\
\
/* Rounding two-source average, 8 pixels wide (low word, then high word). */\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i, x;\
    for(i=0; i<h; i++){\
        for(x=0; x<8; x+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+x]);\
            const uint32_t b= LD32(&src2[i*src_stride2+x]);\
            OP(*((uint32_t*)&dst[i*dst_stride+x]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Four-source average with rounding constant 2; all sources are read-only. */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, x;\
    for(i=0; i<h; i++){\
        for(x=0; x<8; x+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+x]);\
            const uint32_t b= LD32(&src2[i*src_stride2+x]);\
            const uint32_t c= LD32(&src3[i*src_stride3+x]);\
            const uint32_t d= LD32(&src4[i*src_stride4+x]);\
            const uint32_t l0=  (a&0x03030303UL)\
                              + (b&0x03030303UL)\
                              + 0x02020202UL;\
            const uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                             + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t l1=  (c&0x03030303UL)\
                              + (d&0x03030303UL);\
            const uint32_t h1= ((c&0xFCFCFCFCUL)>>2)\
                             + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+x]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
/* Four-source average with rounding constant 1; all sources are read-only. */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, x;\
    for(i=0; i<h; i++){\
        for(x=0; x<8; x+=4){\
            const uint32_t a= LD32(&src1[i*src_stride1+x]);\
            const uint32_t b= LD32(&src2[i*src_stride2+x]);\
            const uint32_t c= LD32(&src3[i*src_stride3+x]);\
            const uint32_t d= LD32(&src4[i*src_stride4+x]);\
            const uint32_t l0=  (a&0x03030303UL)\
                              + (b&0x03030303UL)\
                              + 0x01010101UL;\
            const uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                             + ((b&0xFCFCFCFCUL)>>2);\
            const uint32_t l1=  (c&0x03030303UL)\
                              + (d&0x03030303UL);\
            const uint32_t h1= ((c&0xFCFCFCFCUL)>>2)\
                             + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+x]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* j selects the left / right 4-pixel column; each pass emits two rows per */\
/* iteration and then rewinds both pointers to the top of the next column. */\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a0= LD32(pixels  );\
        const uint32_t b0= LD32(pixels+1);\
        uint32_t l0=  (a0&0x03030303UL)\
                    + (b0&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a0&0xFCFCFCFCUL)>>2)\
                   + ((b0&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* Same as _pixels8_xy2 but with rounding constant 1 instead of 2 per byte. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a0= LD32(pixels  );\
        const uint32_t b0= LD32(pixels+1);\
        uint32_t l0=  (a0&0x03030303UL)\
                    + (b0&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a0&0xFCFCFCFCUL)>>2)\
                   + ((b0&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _pixels8_x2,\
        OPNAME ## _pixels8_y2,\
        OPNAME ## _pixels8_xy2},\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _no_rnd_pixels8_x2,\
        OPNAME ## _no_rnd_pixels8_y2,\
        OPNAME ## _no_rnd_pixels8_xy2},\
};
687

    
688
/* a = per-byte rounded-up average of a and b, four bytes at a time:
 * (a|b) - (((a^b) & 0xFEFEFEFE) >> 1) equals (a + b + 1) >> 1 in every byte. */
#define op_avg(a, b) (a) = ( ((a) | (b)) - ((((a) ^ (b)) & 0xFEFEFEFEUL) >> 1) )
689
#endif
690
/* Plain store: the destination simply takes the computed value. */
#define op_put(a, b) (a) = (b)
691

    
692
/* Instantiate the two function families generated by PIXOP2: avg_* combines
 * the result with the destination through op_avg, put_* stores it through
 * op_put. */
PIXOP2(avg, op_avg)
693
PIXOP2(put, op_put)
694
/* The OP helpers are only needed while PIXOP2 is expanded above. */
#undef op_avg
695
#undef op_put
696

    
697
#if 0
698
/* FIXME: this stuff could be removed as it's not really used anymore */
699
#define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
700
                                                                                         \
701
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
702
{                                                                                        \
703
    BTYPE *p;                                                                            \
704
    const UINT8 *pix;                                                                    \
705
                                                                                         \
706
    p = block;                                                                           \
707
    pix = pixels;                                                                        \
708
    do {                                                                                 \
709
        OP(p[0], pix[0]);                                                                  \
710
        OP(p[1], pix[1]);                                                                  \
711
        OP(p[2], pix[2]);                                                                  \
712
        OP(p[3], pix[3]);                                                                  \
713
        OP(p[4], pix[4]);                                                                  \
714
        OP(p[5], pix[5]);                                                                  \
715
        OP(p[6], pix[6]);                                                                  \
716
        OP(p[7], pix[7]);                                                                  \
717
        pix += line_size;                                                                \
718
        p += INCR;                                                                       \
719
    } while (--h);;                                                                       \
720
}                                                                                        \
721
                                                                                         \
722
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
723
{                                                                                        \
724
    BTYPE *p;                                                                          \
725
    const UINT8 *pix;                                                                    \
726
                                                                                         \
727
    p = block;                                                                           \
728
    pix = pixels;                                                                        \
729
    do {                                                                   \
730
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
731
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
732
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
733
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
734
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
735
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
736
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
737
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
738
        pix += line_size;                                                                \
739
        p += INCR;                                                                       \
740
    } while (--h);                                                                        \
741
}                                                                                        \
742
                                                                                         \
743
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
744
{                                                                                        \
745
    BTYPE *p;                                                                          \
746
    const UINT8 *pix;                                                                    \
747
    const UINT8 *pix1;                                                                   \
748
                                                                                         \
749
    p = block;                                                                           \
750
    pix = pixels;                                                                        \
751
    pix1 = pixels + line_size;                                                           \
752
    do {                                                                                 \
753
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
754
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
755
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
756
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
757
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
758
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
759
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
760
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
761
        pix += line_size;                                                                \
762
        pix1 += line_size;                                                               \
763
        p += INCR;                                                                       \
764
    } while(--h);                                                                         \
765
}                                                                                        \
766
                                                                                         \
767
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
768
{                                                                                        \
769
    BTYPE *p;                                                                          \
770
    const UINT8 *pix;                                                                    \
771
    const UINT8 *pix1;                                                                   \
772
                                                                                         \
773
    p = block;                                                                           \
774
    pix = pixels;                                                                        \
775
    pix1 = pixels + line_size;                                                           \
776
    do {                                                                   \
777
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
778
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
779
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
780
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
781
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
782
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
783
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
784
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
785
        pix += line_size;                                                                \
786
        pix1 += line_size;                                                               \
787
        p += INCR;                                                                       \
788
    } while(--h);                                                                         \
789
}                                                                                        \
790
                                                                                         \
791
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
792
    OPNAME ## _pixels,                                                                   \
793
    OPNAME ## _pixels_x2,                                                                \
794
    OPNAME ## _pixels_y2,                                                                \
795
    OPNAME ## _pixels_xy2,                                                               \
796
};
797

798
/* rounding primitives */
799
#define avg2(a,b) ((a+b+1)>>1)
800
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
801

802
#define op_avg(a, b) a = avg2(a, b)
803
#define op_sub(a, b) a -= b
804
#define op_put(a, b) a = b
805

806
PIXOP(DCTELEM, sub, op_sub, 8)
807
PIXOP(uint8_t, avg, op_avg, line_size)
808
PIXOP(uint8_t, put, op_put, line_size)
809

810
/* non-rounding primitives */
811
#undef avg2
812
#undef avg4
813
#define avg2(a,b) ((a+b)>>1)
814
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
815

816
PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
817
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
818
/* motion estimation */
819

820
#undef avg2
821
#undef avg4
822
#endif
823

    
824
/*
 * Rounding averages of two and four values.
 * Every argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. `x|y` or `c ? a : b`) without operator-precedence
 * surprises. Arguments are still evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
826

    
827

    
828
/**
 * Bilinear blend of an 8-pixel-wide block.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour, the pixel one row below and the pixel below-right. x16 and
 * y16 act as weights out of 16 for the horizontal and vertical direction.
 *
 * @param dst     destination, advanced by stride after every row
 * @param src     source, advanced by stride after every row; columns 0..8
 *                of rows 0..h are read
 * @param stride  distance in bytes between rows of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight (0..16 expected by the weighting scheme)
 * @param y16     vertical weight (0..16 expected by the weighting scheme)
 * @param rounder value added to the weighted sum before the shift by 8
 */
static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Weights of the four contributing pixels; algebraically they sum to 256,
     * which is why the result is scaled back with >>8. */
    const int w_cur        = (16 - x16) * (16 - y16);
    const int w_right      = x16        * (16 - y16);
    const int w_below      = (16 - x16) * y16;
    const int w_belowright = x16        * y16;
    int row;

    for (row = 0; row < h; row++) {
        const UINT8 *next = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur        * src[col]
                      + w_right      * src[col + 1]
                      + w_below      * next[col]
                      + w_belowright * next[col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
850

    
851
/**
 * Affine warp of an 8-pixel-wide block with bilinear sampling.
 *
 * For every output pixel a source coordinate (vx, vy) is tracked
 * incrementally. After dropping the low 16 bits, the lowest `shift` bits
 * of the coordinate are the sub-pixel fraction and the remaining bits are
 * the integer sample position. Samples falling outside the picture are
 * clamped to its border via clip().
 *
 * @param dst     destination block; pixel (x, y) is stored at dst[y*stride + x]
 * @param src     source picture
 * @param stride  distance in bytes between rows of both dst and src
 * @param h       number of rows to produce
 * @param ox, oy  source coordinate of the first output pixel
 * @param dxx, dyx  coordinate increment per output column
 * @param dxy, dyy  coordinate increment per output row
 * @param shift   number of fractional bits in the sub-pixel position
 * @param r       rounding term added before the final shift by 2*shift
 * @param width   source width; positions with src_x >= width-1 are treated as border
 * @param height  source height; positions with src_y >= height-1 are treated as border
 */
static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    /* Last column/row that still has a right/lower neighbour to blend with. */
    const int max_x     = width - 1;
    const int max_y     = height - 1;
    int y;

    for (y = 0; y < h; y++) {
        UINT8 *out = dst + y * stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++) { /* XXX FIXME optimize */
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int inside_x, inside_y, index;

            src_x >>= shift;
            src_y >>= shift;

            /* The unsigned compare rejects negative positions as well. */
            inside_x = (unsigned)src_x < (unsigned)max_x;
            inside_y = (unsigned)src_y < (unsigned)max_y;

            index = (inside_x ? src_x : clip(src_x, 0, max_x))
                  + (inside_y ? src_y : clip(src_y, 0, max_y)) * stride;

            if (inside_x && inside_y) {
                /* Full bilinear interpolation of four neighbours. */
                out[x] = (  (  src[index           ] * (s - frac_x)
                             + src[index + 1       ] * frac_x) * (s - frac_y)
                          + (  src[index + stride    ] * (s - frac_x)
                             + src[index + stride + 1] * frac_x) * frac_y
                          + r) >> (shift * 2);
            } else if (inside_x) {
                /* Row clamped: interpolate horizontally only. */
                out[x] = (  (  src[index    ] * (s - frac_x)
                             + src[index + 1] * frac_x) * s
                          + r) >> (shift * 2);
            } else if (inside_y) {
                /* Column clamped: interpolate vertically only. */
                out[x] = (  (  src[index         ] * (s - frac_y)
                             + src[index + stride] * frac_y) * s
                          + r) >> (shift * 2);
            } else {
                /* Both clamped: plain copy of the border pixel. */
                out[x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
908

    
909
/**
 * Copy a block that is 17 bytes wide and h rows tall.
 *
 * Each row is transferred as four 32-bit words (via LD32/ST32) followed by
 * one trailing byte.
 *
 * @param dst        destination of the first row
 * @param src        source of the first row
 * @param dstStride  distance in bytes between destination rows
 * @param srcStride  distance in bytes between source rows
 * @param h          number of rows to copy
 */
static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        /* Bytes 0..15 as 32-bit words, lowest offset first. */
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* Byte 16 does not fill a word, so copy it on its own. */
        dst[16] = src[16];

        dst += dstStride;
        src += srcStride;
    }
}
923

    
924
/**
 * Copy a block that is 9 bytes wide and h rows tall.
 *
 * Each row is transferred as two 32-bit words (via LD32/ST32) followed by
 * one trailing byte.
 *
 * @param dst        destination of the first row
 * @param src        source of the first row
 * @param dstStride  distance in bytes between destination rows
 * @param srcStride  distance in bytes between source rows
 * @param h          number of rows to copy
 */
static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        /* Bytes 0..7 as 32-bit words, lowest offset first. */
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        /* Byte 8 does not fill a word, so copy it on its own. */
        dst[8] = src[8];

        dst += dstStride;
        src += srcStride;
    }
}
936

    
937
#define QPEL_MC(r, OPNAME, RND, OP) \
938
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
939
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
940
    int i;\
941
    for(i=0; i<h; i++)\
942
    {\
943
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
944
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
945
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
946
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
947
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
948
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
949
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
950
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
951
        dst+=dstStride;\
952
        src+=srcStride;\
953
    }\
954
}\
955
\
956
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
957
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
958
    int i;\
959
    for(i=0; i<w; i++)\
960
    {\
961
        const int src0= src[0*srcStride];\
962
        const int src1= src[1*srcStride];\
963
        const int src2= src[2*srcStride];\
964
        const int src3= src[3*srcStride];\
965
        const int src4= src[4*srcStride];\
966
        const int src5= src[5*srcStride];\
967
        const int src6= src[6*srcStride];\
968
        const int src7= src[7*srcStride];\
969
        const int src8= src[8*srcStride];\
970
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
971
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
972
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
973
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
974
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
975
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
976
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
977
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
978
        dst++;\
979
        src++;\
980
    }\
981
}\
982
\
983
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
984
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
985
    int i;\
986
    for(i=0; i<h; i++)\
987
    {\
988
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
989
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
990
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
991
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
992
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
993
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
994
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
995
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
996
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
997
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
998
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
999
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1000
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1001
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1002
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1003
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1004
        dst+=dstStride;\
1005
        src+=srcStride;\
1006
    }\
1007
}\
1008
\
1009
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
1010
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
1011
    int i;\
1012
    for(i=0; i<w; i++)\
1013
    {\
1014
        const int src0= src[0*srcStride];\
1015
        const int src1= src[1*srcStride];\
1016
        const int src2= src[2*srcStride];\
1017
        const int src3= src[3*srcStride];\
1018
        const int src4= src[4*srcStride];\
1019
        const int src5= src[5*srcStride];\
1020
        const int src6= src[6*srcStride];\
1021
        const int src7= src[7*srcStride];\
1022
        const int src8= src[8*srcStride];\
1023
        const int src9= src[9*srcStride];\
1024
        const int src10= src[10*srcStride];\
1025
        const int src11= src[11*srcStride];\
1026
        const int src12= src[12*srcStride];\
1027
        const int src13= src[13*srcStride];\
1028
        const int src14= src[14*srcStride];\
1029
        const int src15= src[15*srcStride];\
1030
        const int src16= src[16*srcStride];\
1031
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1032
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1033
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1034
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1035
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1036
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1037
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1038
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1039
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1040
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1041
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1042
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1043
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1044
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1045
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1046
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1047
        dst++;\
1048
        src++;\
1049
    }\
1050
}\
1051
\
1052
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
1053
    OPNAME ## pixels8(dst, src, stride, 8);\
1054
}\
1055
\
1056
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1057
    UINT8 half[64];\
1058
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1059
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1060
}\
1061
\
1062
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1063
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1064
}\
1065
\
1066
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1067
    UINT8 half[64];\
1068
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1069
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1070
}\
1071
\
1072
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1073
    UINT8 full[16*9];\
1074
    UINT8 half[64];\
1075
    copy_block9(full, src, 16, stride, 9);\
1076
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
1077
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1078
}\
1079
\
1080
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1081
    UINT8 full[16*9];\
1082
    copy_block9(full, src, 16, stride, 9);\
1083
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
1084
}\
1085
\
1086
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1087
    UINT8 full[16*9];\
1088
    UINT8 half[64];\
1089
    copy_block9(full, src, 16, stride, 9);\
1090
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
1091
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1092
}\
1093
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1094
    UINT8 full[16*9];\
1095
    UINT8 halfH[72];\
1096
    UINT8 halfV[64];\
1097
    UINT8 halfHV[64];\
1098
    copy_block9(full, src, 16, stride, 9);\
1099
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1100
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1101
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1102
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1103
}\
1104
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1105
    UINT8 full[16*9];\
1106
    UINT8 halfH[72];\
1107
    UINT8 halfV[64];\
1108
    UINT8 halfHV[64];\
1109
    copy_block9(full, src, 16, stride, 9);\
1110
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1111
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1112
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1113
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1114
}\
1115
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1116
    UINT8 full[16*9];\
1117
    UINT8 halfH[72];\
1118
    UINT8 halfV[64];\
1119
    UINT8 halfHV[64];\
1120
    copy_block9(full, src, 16, stride, 9);\
1121
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1122
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1123
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1124
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1125
}\
1126
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1127
    UINT8 full[16*9];\
1128
    UINT8 halfH[72];\
1129
    UINT8 halfV[64];\
1130
    UINT8 halfHV[64];\
1131
    copy_block9(full, src, 16, stride, 9);\
1132
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1133
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1134
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1135
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1136
}\
1137
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1138
    UINT8 halfH[72];\
1139
    UINT8 halfHV[64];\
1140
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1141
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1142
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1143
}\
1144
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1145
    UINT8 halfH[72];\
1146
    UINT8 halfHV[64];\
1147
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1148
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1149
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1150
}\
1151
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1152
    UINT8 full[16*9];\
1153
    UINT8 halfH[72];\
1154
    UINT8 halfV[64];\
1155
    UINT8 halfHV[64];\
1156
    copy_block9(full, src, 16, stride, 9);\
1157
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1158
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1159
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1160
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1161
}\
1162
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1163
    UINT8 full[16*9];\
1164
    UINT8 halfH[72];\
1165
    UINT8 halfV[64];\
1166
    UINT8 halfHV[64];\
1167
    copy_block9(full, src, 16, stride, 9);\
1168
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1169
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1170
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1171
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1172
}\
1173
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1174
    UINT8 halfH[72];\
1175
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1176
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
1177
}\
1178
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
1179
    OPNAME ## pixels16(dst, src, stride, 16);\
1180
}\
1181
\
1182
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1183
    UINT8 half[256];\
1184
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1185
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1186
}\
1187
\
1188
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1189
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1190
}\
1191
\
1192
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1193
    UINT8 half[256];\
1194
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1195
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1196
}\
1197
\
1198
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1199
    UINT8 full[24*17];\
1200
    UINT8 half[256];\
1201
    copy_block17(full, src, 24, stride, 17);\
1202
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1203
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1204
}\
1205
\
1206
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1207
    UINT8 full[24*17];\
1208
    copy_block17(full, src, 24, stride, 17);\
1209
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1210
}\
1211
\
1212
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1213
    UINT8 full[24*17];\
1214
    UINT8 half[256];\
1215
    copy_block17(full, src, 24, stride, 17);\
1216
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1217
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1218
}\
1219
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1220
    UINT8 full[24*17];\
1221
    UINT8 halfH[272];\
1222
    UINT8 halfV[256];\
1223
    UINT8 halfHV[256];\
1224
    copy_block17(full, src, 24, stride, 17);\
1225
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1227
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1228
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1229
}\
1230
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1231
    UINT8 full[24*17];\
1232
    UINT8 halfH[272];\
1233
    UINT8 halfV[256];\
1234
    UINT8 halfHV[256];\
1235
    copy_block17(full, src, 24, stride, 17);\
1236
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1238
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1239
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1240
}\
1241
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1242
    UINT8 full[24*17];\
1243
    UINT8 halfH[272];\
1244
    UINT8 halfV[256];\
1245
    UINT8 halfHV[256];\
1246
    copy_block17(full, src, 24, stride, 17);\
1247
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1248
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1249
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1250
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1251
}\
1252
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1253
    UINT8 full[24*17];\
1254
    UINT8 halfH[272];\
1255
    UINT8 halfV[256];\
1256
    UINT8 halfHV[256];\
1257
    copy_block17(full, src, 24, stride, 17);\
1258
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1259
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1260
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1261
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1262
}\
1263
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1264
    UINT8 halfH[272];\
1265
    UINT8 halfHV[256];\
1266
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1267
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1268
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1269
}\
1270
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1271
    UINT8 halfH[272];\
1272
    UINT8 halfHV[256];\
1273
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1274
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1275
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1276
}\
1277
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1278
    UINT8 full[24*17];\
1279
    UINT8 halfH[272];\
1280
    UINT8 halfV[256];\
1281
    UINT8 halfHV[256];\
1282
    copy_block17(full, src, 24, stride, 17);\
1283
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1284
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1285
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1286
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1287
}\
1288
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1289
    UINT8 full[24*17];\
1290
    UINT8 halfH[272];\
1291
    UINT8 halfV[256];\
1292
    UINT8 halfHV[256];\
1293
    copy_block17(full, src, 24, stride, 17);\
1294
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1295
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1296
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1297
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1298
}\
1299
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1300
    UINT8 halfH[272];\
1301
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1302
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
1303
}\
1304
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
1305
  {\
1306
    OPNAME ## qpel16_mc00_c,                                                                   \
1307
    OPNAME ## qpel16_mc10_c,                                                                   \
1308
    OPNAME ## qpel16_mc20_c,                                                                   \
1309
    OPNAME ## qpel16_mc30_c,                                                                   \
1310
    OPNAME ## qpel16_mc01_c,                                                                   \
1311
    OPNAME ## qpel16_mc11_c,                                                                   \
1312
    OPNAME ## qpel16_mc21_c,                                                                   \
1313
    OPNAME ## qpel16_mc31_c,                                                                   \
1314
    OPNAME ## qpel16_mc02_c,                                                                   \
1315
    OPNAME ## qpel16_mc12_c,                                                                   \
1316
    OPNAME ## qpel16_mc22_c,                                                                   \
1317
    OPNAME ## qpel16_mc32_c,                                                                   \
1318
    OPNAME ## qpel16_mc03_c,                                                                   \
1319
    OPNAME ## qpel16_mc13_c,                                                                   \
1320
    OPNAME ## qpel16_mc23_c,                                                                   \
1321
    OPNAME ## qpel16_mc33_c,                                                                   \
1322
  },{\
1323
    OPNAME ## qpel8_mc00_c,                                                                   \
1324
    OPNAME ## qpel8_mc10_c,                                                                   \
1325
    OPNAME ## qpel8_mc20_c,                                                                   \
1326
    OPNAME ## qpel8_mc30_c,                                                                   \
1327
    OPNAME ## qpel8_mc01_c,                                                                   \
1328
    OPNAME ## qpel8_mc11_c,                                                                   \
1329
    OPNAME ## qpel8_mc21_c,                                                                   \
1330
    OPNAME ## qpel8_mc31_c,                                                                   \
1331
    OPNAME ## qpel8_mc02_c,                                                                   \
1332
    OPNAME ## qpel8_mc12_c,                                                                   \
1333
    OPNAME ## qpel8_mc22_c,                                                                   \
1334
    OPNAME ## qpel8_mc32_c,                                                                   \
1335
    OPNAME ## qpel8_mc03_c,                                                                   \
1336
    OPNAME ## qpel8_mc13_c,                                                                   \
1337
    OPNAME ## qpel8_mc23_c,                                                                   \
1338
    OPNAME ## qpel8_mc33_c,                                                                   \
1339
  }\
1340
};
1341

    
1342
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1343
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1344
#define op_put(a, b) a = cm[((b) + 16)>>5]
1345
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1346

    
1347
QPEL_MC(0, put_       , _       , op_put)
1348
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1349
QPEL_MC(0, avg_       , _       , op_avg)
1350
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1351
#undef op_avg
1352
#undef op_avg_no_rnd
1353
#undef op_put
1354
#undef op_put_no_rnd
1355

    
1356
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 *
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 256 pixels
 */
int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1383

    
1384
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which every pixel is first combined with its right-hand neighbour
 * through avg2().
 *
 * Reads 17 pixels per row from pix2 (columns 0..16).
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 256 positions
 */
int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1411

    
1412
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which every pixel is first combined with the pixel directly below it
 * through avg2().
 *
 * Reads 17 rows from pix2 (rows 0..16).
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 256 positions
 */
int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;   /* reference row underneath pix2 */
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1441

    
1442
/**
 * Sum of absolute differences between a 16x16 block and a reference block
 * in which every pixel is first combined with its right, lower and
 * lower-right neighbours through avg4().
 *
 * Reads a 17x17 area from pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 256 positions
 */
int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;   /* reference row underneath pix2 */
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1471

    
1472
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 *
 * @param pix1      top-left pixel of the first block
 * @param pix2      top-left pixel of the second block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 64 pixels
 */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1491

    
1492
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which every pixel is first combined with its right-hand neighbour
 * through avg2().
 *
 * Reads 9 pixels per row from pix2 (columns 0..8).
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 64 positions
 */
int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1511

    
1512
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which every pixel is first combined with the pixel directly below it
 * through avg2().
 *
 * Reads 9 rows from pix2 (rows 0..8).
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 64 positions
 */
int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;   /* reference row underneath pix2 */
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1533

    
1534
/**
 * Sum of absolute differences between an 8x8 block and a reference block
 * in which every pixel is first combined with its right, lower and
 * lower-right neighbours through avg4().
 *
 * Reads a 9x9 area from pix2.
 *
 * @param pix1      top-left pixel of the block being compared
 * @param pix2      top-left pixel of the reference block
 * @param line_size distance in bytes between consecutive rows (both blocks)
 * @return the accumulated absolute difference over all 64 positions
 */
int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;   /* reference row underneath pix2 */
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1555

    
1556
/**
 * Reorder the coefficients of a block in place according to a permutation.
 *
 * Only the positions listed in scantable[0..last] are touched: each of them
 * is saved and cleared, then every saved value is written back at
 * permutation[position].
 *
 * @param block       64-entry coefficient block, modified in place
 * @param permutation maps an original position to its new position
 * @param scantable   lists the positions to process, in scan order
 * @param last        index into scantable of the last position to process;
 *                    nothing is done when it is <= 0
 */
void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
{
    INT16 saved[64];
    int n;

    /* The permutation[1]==1 test is a shortcut, presumably for the identity
       permutation.
       FIXME its ok but not clean and might fail for some perms */
    if (last <= 0 || permutation[1] == 1)
        return;

    /* Pass 1: stash every listed coefficient and zero its old slot. */
    for (n = 0; n <= last; n++) {
        const int pos = scantable[n];

        saved[pos] = block[pos];
        block[pos] = 0;
    }

    /* Pass 2: drop each stashed coefficient into its permuted slot. */
    for (n = 0; n <= last; n++) {
        const int pos = scantable[n];

        block[permutation[pos]] = saved[pos];
    }
}
1576

    
1577
/**
 * Zero six consecutive 64-coefficient blocks.
 *
 * @param blocks start of an array holding at least 6*64 DCTELEM entries
 */
void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
1581

    
1582
void dsputil_init(void)
1583
{
1584
    int i;
1585

    
1586
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1587
    for(i=0;i<MAX_NEG_CROP;i++) {
1588
        cropTbl[i] = 0;
1589
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
1590
    }
1591

    
1592
    for(i=0;i<512;i++) {
1593
        squareTbl[i] = (i - 256) * (i - 256);
1594
    }
1595

    
1596
    get_pixels = get_pixels_c;
1597
    diff_pixels = diff_pixels_c;
1598
    put_pixels_clamped = put_pixels_clamped_c;
1599
    add_pixels_clamped = add_pixels_clamped_c;
1600
    ff_gmc1= gmc1_c;
1601
    ff_gmc= gmc_c;
1602
    clear_blocks= clear_blocks_c;
1603
    pix_sum= pix_sum_c;
1604
    pix_norm1= pix_norm1_c;
1605

    
1606
    pix_abs16x16     = pix_abs16x16_c;
1607
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
1608
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
1609
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1610
    pix_abs8x8     = pix_abs8x8_c;
1611
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
1612
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
1613
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1614

    
1615
#ifdef HAVE_MMX
1616
    dsputil_init_mmx();
1617
#endif
1618
#ifdef ARCH_ARMV4L
1619
    dsputil_init_armv4l();
1620
#endif
1621
#ifdef HAVE_MLIB
1622
    dsputil_init_mlib();
1623
#endif
1624
#ifdef ARCH_ALPHA
1625
    dsputil_init_alpha();
1626
#endif
1627
#ifdef ARCH_POWERPC
1628
    dsputil_init_ppc();
1629
#endif
1630
#ifdef HAVE_MMI
1631
    dsputil_init_mmi();
1632
#endif
1633

    
1634
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1635
}
1636

    
1637
/* remove any non bit exact operation (testing purpose) */
1638
void avcodec_set_bit_exact(void)
1639
{
1640
    ff_bit_exact=1;
1641
#ifdef HAVE_MMX
1642
    dsputil_set_bit_exact_mmx();
1643
#endif
1644
}
1645

    
1646
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1647
              int orig_linesize[3], int coded_linesize,
1648
              AVCodecContext *avctx)
1649
{
1650
    int quad, diff, x, y;
1651
    UINT8 *orig, *coded;
1652
    UINT32 *sq = squareTbl + 256;
1653
    
1654
    quad = 0;
1655
    diff = 0;
1656
    
1657
    /* Luminance */
1658
    orig = orig_image[0];
1659
    coded = coded_image[0];
1660
    
1661
    for (y=0;y<avctx->height;y++) {
1662
        for (x=0;x<avctx->width;x++) {
1663
            diff = *(orig + x) - *(coded + x);
1664
            quad += sq[diff];
1665
        }
1666
        orig += orig_linesize[0];
1667
        coded += coded_linesize;
1668
    }
1669
   
1670
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1671
    
1672
    if (avctx->psnr_y) {
1673
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1674
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1675
    } else
1676
        avctx->psnr_y = 99.99;
1677
}
1678