Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 02da51ec

History | View | Annotate | Download (52 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
#include "avcodec.h"
22
#include "dsputil.h"
23
#include "simple_idct.h"
24

    
25
void (*ff_idct)(DCTELEM *block);
26
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
28
void (*av_fdct)(DCTELEM *block);
29
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
30
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
31
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
33
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
34
void (*clear_blocks)(DCTELEM *blocks);
35

    
36
op_pixels_abs_func pix_abs16x16;
37
op_pixels_abs_func pix_abs16x16_x2;
38
op_pixels_abs_func pix_abs16x16_y2;
39
op_pixels_abs_func pix_abs16x16_xy2;
40

    
41
op_pixels_abs_func pix_abs8x8;
42
op_pixels_abs_func pix_abs8x8_x2;
43
op_pixels_abs_func pix_abs8x8_y2;
44
op_pixels_abs_func pix_abs8x8_xy2;
45

    
46
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
47
UINT32 squareTbl[512];
48

    
49
extern INT16 default_intra_matrix[64];
50
extern INT16 default_non_intra_matrix[64];
51
extern INT16 ff_mpeg4_default_intra_matrix[64];
52
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
53

    
54
UINT8 zigzag_direct[64] = {
55
    0, 1, 8, 16, 9, 2, 3, 10,
56
    17, 24, 32, 25, 18, 11, 4, 5,
57
    12, 19, 26, 33, 40, 48, 41, 34,
58
    27, 20, 13, 6, 7, 14, 21, 28,
59
    35, 42, 49, 56, 57, 50, 43, 36,
60
    29, 22, 15, 23, 30, 37, 44, 51,
61
    58, 59, 52, 45, 38, 31, 39, 46,
62
    53, 60, 61, 54, 47, 55, 62, 63
63
};
64

    
65
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
66
UINT16 __align8 inv_zigzag_direct16[64];
67

    
68
/* not permutated zigzag_direct for MMX quantizer */
69
UINT8 zigzag_direct_noperm[64];
70

    
71
UINT8 ff_alternate_horizontal_scan[64] = {
72
    0,  1,  2,  3,  8,  9, 16, 17, 
73
    10, 11,  4,  5,  6,  7, 15, 14,
74
    13, 12, 19, 18, 24, 25, 32, 33, 
75
    26, 27, 20, 21, 22, 23, 28, 29,
76
    30, 31, 34, 35, 40, 41, 48, 49, 
77
    42, 43, 36, 37, 38, 39, 44, 45,
78
    46, 47, 50, 51, 56, 57, 58, 59, 
79
    52, 53, 54, 55, 60, 61, 62, 63,
80
};
81

    
82
UINT8 ff_alternate_vertical_scan[64] = {
83
    0,  8, 16, 24,  1,  9,  2, 10, 
84
    17, 25, 32, 40, 48, 56, 57, 49,
85
    41, 33, 26, 18,  3, 11,  4, 12, 
86
    19, 27, 34, 42, 50, 58, 35, 43,
87
    51, 59, 20, 28,  5, 13,  6, 14, 
88
    21, 29, 36, 44, 52, 60, 37, 45,
89
    53, 61, 22, 30,  7, 15, 23, 31, 
90
    38, 46, 54, 62, 39, 47, 55, 63,
91
};
92

    
93
#ifdef SIMPLE_IDCT
94

    
95
/* Input permutation for the simple_idct_mmx */
96
static UINT8 simple_mmx_permutation[64]={
97
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
98
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
99
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
100
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
101
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
102
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
103
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
104
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
105
};
106
#endif
107

    
108
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109
UINT32 inverse[256]={
110
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
111
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
112
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
113
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
114
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
115
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
116
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
117
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
118
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
119
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
120
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
121
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
122
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
123
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
124
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
125
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
126
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
127
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
128
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
129
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
130
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
131
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
132
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
133
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
134
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
135
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
136
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
137
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
138
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
139
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
140
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
141
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
142
};
143

    
144
/* used to skip zeros at the end */
145
UINT8 zigzag_end[64];
146

    
147
UINT8 permutation[64];
148
//UINT8 invPermutation[64];
149

    
150
static void build_zigzag_end(void)
151
{
152
    int lastIndex;
153
    int lastIndexAfterPerm=0;
154
    for(lastIndex=0; lastIndex<64; lastIndex++)
155
    {
156
        if(zigzag_direct[lastIndex] > lastIndexAfterPerm) 
157
            lastIndexAfterPerm= zigzag_direct[lastIndex];
158
        zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
159
    }
160
}
161

    
162
/**
 * Load an 8x8 area of samples into a coefficient block.
 *
 * @param block     destination; 64 values are written contiguously,
 *                  8 per source row
 * @param pixels    first sample of the first source row
 * @param line_size offset, in UINT8 elements, from one source row to the next
 */
void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row;

    for (row = 0; row < 8; row++) {
        int col;

        /* widen one row of samples into the block */
        for (col = 0; col < 8; col++) {
            block[col] = pixels[col];
        }
        block  += 8;
        pixels += line_size;
    }
}
180

    
181
/**
 * Store the sample-wise difference of two 8x8 areas into a coefficient block.
 *
 * @param block  destination; receives s1 - s2 for 64 samples, written
 *               contiguously, 8 per row
 * @param s1     first sample of the minuend area
 * @param s2     first sample of the subtrahend area
 * @param stride offset, in UINT8 elements, between successive rows; the same
 *               value is applied to both s1 and s2
 */
void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
                   int stride){
    int row;

    for (row = 0; row < 8; row++) {
        int col;

        /* one row of differences */
        for (col = 0; col < 8; col++) {
            block[col] = s1[col] - s2[col];
        }
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
200

    
201

    
202
/**
 * Write an 8x8 coefficient block to a sample area through the crop table.
 *
 * Each output sample is cropTbl[MAX_NEG_CROP + coefficient]; the table is
 * addressed from its MAX_NEG_CROP-th entry so that negative coefficients
 * select entries in front of that point.
 * NOTE(review): cropTbl is filled in elsewhere; presumably it clamps to the
 * 0..255 sample range -- confirm against the init code.
 *
 * @param block     source; 64 values read contiguously, 8 per row
 * @param pixels    first sample of the first destination row
 * @param line_size offset, in UINT8 elements, between destination rows
 */
void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    UINT8 *crop = cropTbl + MAX_NEG_CROP;
    int row;

    for (row = 0; row < 8; row++) {
        int col;

        /* store one row of table-mapped samples */
        for (col = 0; col < 8; col++) {
            pixels[col] = crop[block[col]];
        }
        block  += 8;
        pixels += line_size;
    }
}
223

    
224
/**
 * Add an 8x8 coefficient block onto a sample area through the crop table.
 *
 * Each destination sample is replaced by
 * cropTbl[MAX_NEG_CROP + old_sample + coefficient]; the table is addressed
 * from its MAX_NEG_CROP-th entry so that negative sums select entries in
 * front of that point.
 * NOTE(review): cropTbl is filled in elsewhere; presumably it clamps to the
 * 0..255 sample range -- confirm against the init code.
 *
 * @param block     source; 64 values read contiguously, 8 per row
 * @param pixels    first sample of the first row to update in place
 * @param line_size offset, in UINT8 elements, between destination rows
 */
void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    UINT8 *crop = cropTbl + MAX_NEG_CROP;
    int row;

    for (row = 0; row < 8; row++) {
        int col;

        /* read-modify-write one row */
        for (col = 0; col < 8; col++) {
            pixels[col] = crop[pixels[col] + block[col]];
        }
        block  += 8;
        pixels += line_size;
    }
}
244

    
245
#if 0
246

247
#define PIXOP2(OPNAME, OP) \
248
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
249
{\
250
    int i;\
251
    for(i=0; i<h; i++){\
252
        OP(*((uint64_t*)block), LD64(pixels));\
253
        pixels+=line_size;\
254
        block +=line_size;\
255
    }\
256
}\
257
\
258
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
259
{\
260
    int i;\
261
    for(i=0; i<h; i++){\
262
        const uint64_t a= LD64(pixels  );\
263
        const uint64_t b= LD64(pixels+1);\
264
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
265
        pixels+=line_size;\
266
        block +=line_size;\
267
    }\
268
}\
269
\
270
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
271
{\
272
    int i;\
273
    for(i=0; i<h; i++){\
274
        const uint64_t a= LD64(pixels  );\
275
        const uint64_t b= LD64(pixels+1);\
276
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
277
        pixels+=line_size;\
278
        block +=line_size;\
279
    }\
280
}\
281
\
282
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
283
{\
284
    int i;\
285
    for(i=0; i<h; i++){\
286
        const uint64_t a= LD64(pixels          );\
287
        const uint64_t b= LD64(pixels+line_size);\
288
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
289
        pixels+=line_size;\
290
        block +=line_size;\
291
    }\
292
}\
293
\
294
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
295
{\
296
    int i;\
297
    for(i=0; i<h; i++){\
298
        const uint64_t a= LD64(pixels          );\
299
        const uint64_t b= LD64(pixels+line_size);\
300
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
301
        pixels+=line_size;\
302
        block +=line_size;\
303
    }\
304
}\
305
\
306
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
307
{\
308
        int i;\
309
        const uint64_t a= LD64(pixels  );\
310
        const uint64_t b= LD64(pixels+1);\
311
        uint64_t l0=  (a&0x0303030303030303ULL)\
312
                    + (b&0x0303030303030303ULL)\
313
                    + 0x0202020202020202ULL;\
314
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
315
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
316
        uint64_t l1,h1;\
317
\
318
        pixels+=line_size;\
319
        for(i=0; i<h; i+=2){\
320
            uint64_t a= LD64(pixels  );\
321
            uint64_t b= LD64(pixels+1);\
322
            l1=  (a&0x0303030303030303ULL)\
323
               + (b&0x0303030303030303ULL);\
324
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
325
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
326
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
327
            pixels+=line_size;\
328
            block +=line_size;\
329
            a= LD64(pixels  );\
330
            b= LD64(pixels+1);\
331
            l0=  (a&0x0303030303030303ULL)\
332
               + (b&0x0303030303030303ULL)\
333
               + 0x0202020202020202ULL;\
334
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
335
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
336
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
337
            pixels+=line_size;\
338
            block +=line_size;\
339
        }\
340
}\
341
\
342
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
343
{\
344
        int i;\
345
        const uint64_t a= LD64(pixels  );\
346
        const uint64_t b= LD64(pixels+1);\
347
        uint64_t l0=  (a&0x0303030303030303ULL)\
348
                    + (b&0x0303030303030303ULL)\
349
                    + 0x0101010101010101ULL;\
350
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
351
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
352
        uint64_t l1,h1;\
353
\
354
        pixels+=line_size;\
355
        for(i=0; i<h; i+=2){\
356
            uint64_t a= LD64(pixels  );\
357
            uint64_t b= LD64(pixels+1);\
358
            l1=  (a&0x0303030303030303ULL)\
359
               + (b&0x0303030303030303ULL);\
360
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
361
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
362
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
363
            pixels+=line_size;\
364
            block +=line_size;\
365
            a= LD64(pixels  );\
366
            b= LD64(pixels+1);\
367
            l0=  (a&0x0303030303030303ULL)\
368
               + (b&0x0303030303030303ULL)\
369
               + 0x0101010101010101ULL;\
370
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
371
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
372
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
373
            pixels+=line_size;\
374
            block +=line_size;\
375
        }\
376
}\
377
\
378
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
379
    OPNAME ## _pixels,\
380
    OPNAME ## _pixels_x2,\
381
    OPNAME ## _pixels_y2,\
382
    OPNAME ## _pixels_xy2,\
383
};\
384
\
385
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
386
    OPNAME ## _pixels,\
387
    OPNAME ## _no_rnd_pixels_x2,\
388
    OPNAME ## _no_rnd_pixels_y2,\
389
    OPNAME ## _no_rnd_pixels_xy2,\
390
};
391

392
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
393
#else /* 64 bit variant above is compiled out; the 32 bit variant follows */
394

    
395
/*
 * PIXOP2(OPNAME, OP) -- 32 bit variant.
 *
 * Expands to the functions
 *     OPNAME_pixels, OPNAME_pixels_x2, OPNAME_pixels_y2, OPNAME_pixels_xy2,
 *     OPNAME_no_rnd_pixels_x2, OPNAME_no_rnd_pixels_y2,
 *     OPNAME_no_rnd_pixels_xy2
 * and to the two tables OPNAME_pixels_tab[4] and OPNAME_no_rnd_pixels_tab[4]
 * (entry 0 of both tables is the plain OPNAME_pixels).
 *
 * Every function covers an area 8 bytes wide and h rows tall, advancing both
 * pointers by line_size per row. Work is done four bytes at a time: source
 * words are fetched with LD32() and passed to OP(dst, value), where dst is
 * the uint32_t at the destination address.
 * NOTE(review): block is accessed through a uint32_t pointer; presumably
 * callers guarantee suitable alignment -- confirm.
 *
 * _x2  combines each byte with the byte at +1,
 * _y2  combines each byte with the byte at +line_size,
 * _xy2 combines the four bytes at +0, +1, +line_size and +line_size+1.
 *
 * Two-input forms: (a|b) - (((a^b)&0xFE..)>>1) is the per-byte average
 * rounded up, (a&b) + (((a^b)&0xFE..)>>1) the per-byte average rounded down;
 * the 0xFE mask keeps the shift from leaking bits across byte lanes.
 * Four-input forms split every byte into its low 2 bits (l*) and its high
 * 6 bits pre-shifted by 2 (h*); the rounding form adds 2 per byte, the
 * no_rnd form adds 1, before the final >>2.
 *
 * The xy2 loops produce two rows per iteration, so they expect an even h.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        int off;\
        for(off=0; off<8; off+=4){\
            OP(*((uint32_t*)(block+off)), LD32(pixels+off));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        int off;\
        for(off=0; off<8; off+=4){\
            const uint32_t a= LD32(pixels+off  );\
            const uint32_t b= LD32(pixels+off+1);\
            OP(*((uint32_t*)(block+off)), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        int off;\
        for(off=0; off<8; off+=4){\
            const uint32_t a= LD32(pixels+off  );\
            const uint32_t b= LD32(pixels+off+1);\
            OP(*((uint32_t*)(block+off)), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        int off;\
        for(off=0; off<8; off+=4){\
            const uint32_t a= LD32(pixels+off          );\
            const uint32_t b= LD32(pixels+off+line_size);\
            OP(*((uint32_t*)(block+off)), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for(row=0; row<h; row++){\
        int off;\
        for(off=0; off<8; off+=4){\
            const uint32_t a= LD32(pixels+off          );\
            const uint32_t b= LD32(pixels+off+line_size);\
            OP(*((uint32_t*)(block+off)), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int col;\
    for(col=0; col<2; col++){\
        int row;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int col;\
    for(col=0; col<2; col++){\
        int row;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
570
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
571
#endif
572

    
573
#define op_put(a, b) a = b
574

    
575
PIXOP2(avg, op_avg)
576
PIXOP2(put, op_put)
577
#undef op_avg
578
#undef op_put
579

    
580
#if 0
581
/* FIXME this stuff could be removed as it's not really used anymore */
582
#define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
583
                                                                                         \
584
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
585
{                                                                                        \
586
    BTYPE *p;                                                                            \
587
    const UINT8 *pix;                                                                    \
588
                                                                                         \
589
    p = block;                                                                           \
590
    pix = pixels;                                                                        \
591
    do {                                                                                 \
592
        OP(p[0], pix[0]);                                                                  \
593
        OP(p[1], pix[1]);                                                                  \
594
        OP(p[2], pix[2]);                                                                  \
595
        OP(p[3], pix[3]);                                                                  \
596
        OP(p[4], pix[4]);                                                                  \
597
        OP(p[5], pix[5]);                                                                  \
598
        OP(p[6], pix[6]);                                                                  \
599
        OP(p[7], pix[7]);                                                                  \
600
        pix += line_size;                                                                \
601
        p += INCR;                                                                       \
602
    } while (--h);;                                                                       \
603
}                                                                                        \
604
                                                                                         \
605
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
606
{                                                                                        \
607
    BTYPE *p;                                                                          \
608
    const UINT8 *pix;                                                                    \
609
                                                                                         \
610
    p = block;                                                                           \
611
    pix = pixels;                                                                        \
612
    do {                                                                   \
613
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
614
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
615
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
616
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
617
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
618
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
619
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
620
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
621
        pix += line_size;                                                                \
622
        p += INCR;                                                                       \
623
    } while (--h);                                                                        \
624
}                                                                                        \
625
                                                                                         \
626
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
627
{                                                                                        \
628
    BTYPE *p;                                                                          \
629
    const UINT8 *pix;                                                                    \
630
    const UINT8 *pix1;                                                                   \
631
                                                                                         \
632
    p = block;                                                                           \
633
    pix = pixels;                                                                        \
634
    pix1 = pixels + line_size;                                                           \
635
    do {                                                                                 \
636
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
637
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
638
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
639
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
640
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
641
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
642
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
643
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
644
        pix += line_size;                                                                \
645
        pix1 += line_size;                                                               \
646
        p += INCR;                                                                       \
647
    } while(--h);                                                                         \
648
}                                                                                        \
649
                                                                                         \
650
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
651
{                                                                                        \
652
    BTYPE *p;                                                                          \
653
    const UINT8 *pix;                                                                    \
654
    const UINT8 *pix1;                                                                   \
655
                                                                                         \
656
    p = block;                                                                           \
657
    pix = pixels;                                                                        \
658
    pix1 = pixels + line_size;                                                           \
659
    do {                                                                   \
660
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
661
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
662
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
663
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
664
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
665
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
666
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
667
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
668
        pix += line_size;                                                                \
669
        pix1 += line_size;                                                               \
670
        p += INCR;                                                                       \
671
    } while(--h);                                                                         \
672
}                                                                                        \
673
                                                                                         \
674
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
675
    OPNAME ## _pixels,                                                                   \
676
    OPNAME ## _pixels_x2,                                                                \
677
    OPNAME ## _pixels_y2,                                                                \
678
    OPNAME ## _pixels_xy2,                                                               \
679
};
680

681
/* rounding primitives */
682
#define avg2(a,b) ((a+b+1)>>1)
683
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
684

685
#define op_avg(a, b) a = avg2(a, b)
686
#define op_sub(a, b) a -= b
687

688
PIXOP(DCTELEM, sub, op_sub, 8)
689

690
/* not rounding primitives */
691
#undef avg2
692
#undef avg4
693
#define avg2(a,b) ((a+b)>>1)
694
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
695

696
/* motion estimation */
697

698
#undef avg2
699
#undef avg4
700
#endif
701

    
702
/* Rounding averages of two / four values (used by the motion estimation
   code below).  Every argument is parenthesized so that expressions such
   as "x << 1" or "c ? a : b" can be passed safely.  Arguments are still
   evaluated exactly once each, but must not rely on evaluation order. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
704

    
705
/* Weighted blend of each pixel with its right, lower and lower-right
   neighbours for an 8 pixel wide, h row high block.
   x16 / y16 select the four weights ((16-x16)*(16-y16), x16*(16-y16), ...),
   which always sum to 256; the result is shifted right by 8.
   The rounding term added before the shift is 128 - rounder.
   dst and src both advance by srcStride per row. */
static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
{
    const int w00 = (16 - x16) * (16 - y16);   /* weight of src[x]               */
    const int w01 = x16 * (16 - y16);          /* weight of src[x+1]             */
    const int w10 = (16 - x16) * y16;          /* weight of src[x+srcStride]     */
    const int w11 = x16 * y16;                 /* weight of src[x+srcStride+1]   */
    const int bias = 128 - rounder;
    int row, x;

    for (row = 0; row < h; row++) {
        const UINT8 *below = src + srcStride;

        for (x = 0; x < 8; x++)
            dst[x] = (w00*src[x] + w01*src[x+1] + w10*below[x] + w11*below[x+1] + bias) >> 8;

        dst += srcStride;
        src += srcStride;
    }
}
728

    
729
/* Horizontal 8-tap lowpass filter producing 8 output pixels per row from
   the 9 input samples src[0..8], for h rows.
   Each output is ((a)*20 - (b)*6 + (c)*3 - (d) + r) >> 5, clipped through
   cropTbl, where a..d are sums of two input samples. */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
{
    /* Input sample indices for every output pixel, given as the pairs
       weighted +20, -6, +3 and -1.  Near both ends of the row the outer
       taps reuse samples from inside the 9-sample window. */
    static const UINT8 tap[8][8] = {
        { 0, 1,   0, 2,   1, 3,   2, 4 },
        { 1, 2,   0, 3,   0, 4,   1, 5 },
        { 2, 3,   1, 4,   0, 5,   0, 6 },
        { 3, 4,   2, 5,   1, 6,   0, 7 },
        { 4, 5,   3, 6,   2, 7,   1, 8 },
        { 5, 6,   4, 7,   3, 8,   2, 8 },
        { 6, 7,   5, 8,   4, 8,   3, 7 },
        { 7, 8,   6, 8,   5, 7,   4, 6 },
    };
    UINT8 *clip = cropTbl + MAX_NEG_CROP;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++) {
            const UINT8 *t = tap[x];
            const int sum = (src[t[0]] + src[t[1]]) * 20
                          - (src[t[2]] + src[t[3]]) * 6
                          + (src[t[4]] + src[t[5]]) * 3
                          - (src[t[6]] + src[t[7]])
                          + r;

            dst[x] = clip[sum >> 5];
        }
        dst += dstStride;
        src += srcStride;
    }
}
747

    
748
/* Vertical counterpart of qpel_h_lowpass: for each of w columns, the 9
   input samples src[0..8*srcStride] produce 8 output pixels written down
   the column of dst.  Same weights (20, -6, 3, -1), rounding term r,
   shift by 5 and clipping through cropTbl. */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
{
    /* Input row indices for every output row, given as the pairs weighted
       +20, -6, +3 and -1.  Near the top and bottom the outer taps reuse
       samples from inside the 9-sample window. */
    static const UINT8 tap[8][8] = {
        { 0, 1,   0, 2,   1, 3,   2, 4 },
        { 1, 2,   0, 3,   0, 4,   1, 5 },
        { 2, 3,   1, 4,   0, 5,   0, 6 },
        { 3, 4,   2, 5,   1, 6,   0, 7 },
        { 4, 5,   3, 6,   2, 7,   1, 8 },
        { 5, 6,   4, 7,   3, 8,   2, 8 },
        { 6, 7,   5, 8,   4, 8,   3, 7 },
        { 7, 8,   6, 8,   5, 7,   4, 6 },
    };
    UINT8 *clip = cropTbl + MAX_NEG_CROP;
    int col, y, k;

    for (col = 0; col < w; col++) {
        int s[9];

        /* fetch the complete input column before storing any output */
        for (k = 0; k < 9; k++)
            s[k] = src[k * srcStride];

        for (y = 0; y < 8; y++) {
            const UINT8 *t = tap[y];
            const int sum = (s[t[0]] + s[t[1]]) * 20
                          - (s[t[2]] + s[t[3]]) * 6
                          + (s[t[4]] + s[t[5]]) * 3
                          - (s[t[6]] + s[t[7]])
                          + r;

            dst[y * dstStride] = clip[sum >> 5];
        }
        dst++;
        src++;
    }
}
775

    
776
/* Copy an 8x8 block of bytes from src (row pitch srcStride) to dst
   (row pitch dstStride). */
static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
{
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            dst[x] = src[x];
        dst += dstStride;
        src += srcStride;
    }
}
793

    
794
/* Write the average of two 8x8 blocks to dst: (src1 + src2 + r) >> 1.
   src1 advances by srcStride per row, while src2 is always read with a
   fixed pitch of 8 bytes. */
static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
{
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            dst[x] = (src1[x] + src2[x] + r) >> 1;
        dst  += dstStride;
        src1 += srcStride;
        src2 += 8;
    }
}
812

    
813
/* Write the average of four 8x8 blocks to dst:
   (src1 + src2 + src3 + src4 + r) >> 2.
   src1 advances by srcStride per row; src2, src3 and src4 are always read
   with a fixed pitch of 8 bytes. */
static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
{
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            dst[x] = (src1[x] + src2[x] + src3[x] + src4[x] + r) >> 2;
        dst  += dstStride;
        src1 += srcStride;
        src2 += 8;
        src3 += 8;
        src4 += 8;
    }
}
833

    
834
#define QPEL_MC(r, name) \
835
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
836
{\
837
    put_block(dst, src, dstStride, srcStride);\
838
}\
839
\
840
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
841
{\
842
    UINT8 half[64];\
843
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
844
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
845
}\
846
\
847
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
848
{\
849
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
850
}\
851
\
852
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
853
{\
854
    UINT8 half[64];\
855
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
856
    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
857
}\
858
\
859
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
860
{\
861
    UINT8 half[64];\
862
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
863
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
864
}\
865
\
866
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
867
{\
868
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
869
}\
870
\
871
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
872
{\
873
    UINT8 half[64];\
874
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
875
    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
876
}\
877
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
878
{\
879
    UINT8 halfH[72];\
880
    UINT8 halfV[64];\
881
    UINT8 halfHV[64];\
882
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
883
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
884
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
885
    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
886
}\
887
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
888
{\
889
    UINT8 halfH[72];\
890
    UINT8 halfV[64];\
891
    UINT8 halfHV[64];\
892
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
893
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
894
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
895
    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
896
}\
897
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898
{\
899
    UINT8 halfH[72];\
900
    UINT8 halfV[64];\
901
    UINT8 halfHV[64];\
902
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
903
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
904
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
905
    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
906
}\
907
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
908
{\
909
    UINT8 halfH[72];\
910
    UINT8 halfV[64];\
911
    UINT8 halfHV[64];\
912
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
913
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
914
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
915
    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
916
}\
917
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
918
{\
919
    UINT8 halfH[72];\
920
    UINT8 halfHV[64];\
921
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
922
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
923
    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
924
}\
925
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
926
{\
927
    UINT8 halfH[72];\
928
    UINT8 halfHV[64];\
929
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
930
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
931
    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
932
}\
933
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
934
{\
935
    UINT8 halfH[72];\
936
    UINT8 halfV[64];\
937
    UINT8 halfHV[64];\
938
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
939
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
940
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
941
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
942
}\
943
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944
{\
945
    UINT8 halfH[72];\
946
    UINT8 halfV[64];\
947
    UINT8 halfHV[64];\
948
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
949
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
950
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
951
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
952
}\
953
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
954
{\
955
    UINT8 halfH[72];\
956
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
957
    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
958
}\
959
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
960
    qpel_mc00_c ## name,                                                                   \
961
    qpel_mc10_c ## name,                                                                   \
962
    qpel_mc20_c ## name,                                                                   \
963
    qpel_mc30_c ## name,                                                                   \
964
    qpel_mc01_c ## name,                                                                   \
965
    qpel_mc11_c ## name,                                                                   \
966
    qpel_mc21_c ## name,                                                                   \
967
    qpel_mc31_c ## name,                                                                   \
968
    qpel_mc02_c ## name,                                                                   \
969
    qpel_mc12_c ## name,                                                                   \
970
    qpel_mc22_c ## name,                                                                   \
971
    qpel_mc32_c ## name,                                                                   \
972
    qpel_mc03_c ## name,                                                                   \
973
    qpel_mc13_c ## name,                                                                   \
974
    qpel_mc23_c ## name,                                                                   \
975
    qpel_mc33_c ## name,                                                                   \
976
};
977

    
978
QPEL_MC(0, _rnd)
979
QPEL_MC(1, _no_rnd)
980

    
981
/* Sum of absolute differences between two 16x16 pixel blocks that share
   the row pitch line_size. */
int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1008

    
1009
/* Sum of absolute differences between the 16x16 block pix1 and pix2
   averaged (avg2) with its right-hand neighbour; reads 17 pixels per row
   of pix2. */
int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1036

    
1037
/* Sum of absolute differences between the 16x16 block pix1 and pix2
   averaged (avg2) with the row below it; reads 17 rows of pix2. */
int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1066

    
1067
/* Sum of absolute differences between the 16x16 block pix1 and the avg4
   of each pix2 pixel with its right, lower and lower-right neighbours;
   reads a 17x17 area of pix2. */
int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1096

    
1097
/* Sum of absolute differences between two 8x8 pixel blocks that share the
   row pitch line_size. */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1116

    
1117
/* Sum of absolute differences between the 8x8 block pix1 and pix2
   averaged (avg2) with its right-hand neighbour; reads 9 pixels per row
   of pix2. */
int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1136

    
1137
/* Sum of absolute differences between the 8x8 block pix1 and pix2
   averaged (avg2) with the row below it; reads 9 rows of pix2. */
int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1158

    
1159
/* Sum of absolute differences between the 8x8 block pix1 and the avg4 of
   each pix2 pixel with its right, lower and lower-right neighbours; reads
   a 9x9 area of pix2. */
int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    UINT8 *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
1180

    
1181
/* permute block according so that it corresponds to the MMX idct
1182
   order */
1183
#ifdef SIMPLE_IDCT
1184
 /* general permutation, but perhaps slightly slower */
1185
void block_permute(INT16 *block)
1186
{
1187
        int i;
1188
        INT16 temp[64];
1189

    
1190
        for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1191

    
1192
        for(i=0; i<64; i++) block[i] = temp[i];
1193
}
1194
#else
1195

    
1196
void block_permute(INT16 *block)
1197
{
1198
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1199
    int i;
1200

    
1201
    for(i=0;i<8;i++) {
1202
        tmp1 = block[1];
1203
        tmp2 = block[2];
1204
        tmp3 = block[3];
1205
        tmp4 = block[4];
1206
        tmp5 = block[5];
1207
        tmp6 = block[6];
1208
        block[1] = tmp2;
1209
        block[2] = tmp4;
1210
        block[3] = tmp6;
1211
        block[4] = tmp1;
1212
        block[5] = tmp3;
1213
        block[6] = tmp5;
1214
        block += 8;
1215
    }
1216
}
1217
#endif
1218

    
1219
/* Zero 6 consecutive blocks of 64 DCTELEM coefficients each. */
void clear_blocks_c(DCTELEM *blocks)
{
    const size_t nbytes = sizeof(DCTELEM) * 6 * 64;

    memset(blocks, 0, nbytes);
}
1223

    
1224
/* XXX: those functions should be suppressed ASAP when all IDCTs are
1225
   converted */
1226
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1227
{
1228
    ff_idct (block);
1229
    put_pixels_clamped(block, dest, line_size);
1230
}
1231

    
1232
/* Run the IDCT selected in ff_idct on block, then hand the block to
   add_pixels_clamped() to combine it with dest (row pitch line_size). */
void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
{
    (*ff_idct)(block);
    (*add_pixels_clamped)(block, dest, line_size);
}
1237

    
1238
void dsputil_init(void)
1239
{
1240
    int i, j;
1241
    int use_permuted_idct;
1242

    
1243
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1244
    for(i=0;i<MAX_NEG_CROP;i++) {
1245
        cropTbl[i] = 0;
1246
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
1247
    }
1248

    
1249
    for(i=0;i<512;i++) {
1250
        squareTbl[i] = (i - 256) * (i - 256);
1251
    }
1252

    
1253
#ifdef SIMPLE_IDCT
1254
    ff_idct = NULL;
1255
#else
1256
    ff_idct = j_rev_dct;
1257
#endif
1258
    get_pixels = get_pixels_c;
1259
    diff_pixels = diff_pixels_c;
1260
    put_pixels_clamped = put_pixels_clamped_c;
1261
    add_pixels_clamped = add_pixels_clamped_c;
1262
    gmc1= gmc1_c;
1263
    clear_blocks= clear_blocks_c;
1264

    
1265
    pix_abs16x16     = pix_abs16x16_c;
1266
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
1267
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
1268
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1269
    pix_abs8x8     = pix_abs8x8_c;
1270
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
1271
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
1272
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1273
    av_fdct = fdct_ifast;
1274

    
1275
    use_permuted_idct = 1;
1276

    
1277
#ifdef HAVE_MMX
1278
    dsputil_init_mmx();
1279
#endif
1280
#ifdef ARCH_ARMV4L
1281
    dsputil_init_armv4l();
1282
#endif
1283
#ifdef HAVE_MLIB
1284
    dsputil_init_mlib();
1285
    use_permuted_idct = 0;
1286
#endif
1287
#ifdef ARCH_ALPHA
1288
    dsputil_init_alpha();
1289
    use_permuted_idct = 0;
1290
#endif
1291

    
1292
#ifdef SIMPLE_IDCT
1293
    if (ff_idct == NULL) {
1294
        ff_idct_put = simple_idct_put;
1295
        ff_idct_add = simple_idct_add;
1296
        use_permuted_idct=0;
1297
    } else {
1298
        ff_idct_put = gen_idct_put;
1299
        ff_idct_add = gen_idct_add;
1300
    }
1301
#endif
1302

    
1303
    if(use_permuted_idct)
1304
#ifdef SIMPLE_IDCT
1305
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1306
#else
1307
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1308
#endif
1309
    else
1310
        for(i=0; i<64; i++) permutation[i]=i;
1311

    
1312
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1313
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1314
    
1315
    if (use_permuted_idct) {
1316
        /* permute for IDCT */
1317
        for(i=0;i<64;i++) {
1318
            j = zigzag_direct[i];
1319
            zigzag_direct[i] = block_permute_op(j);
1320
            j = ff_alternate_horizontal_scan[i];
1321
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
1322
            j = ff_alternate_vertical_scan[i];
1323
            ff_alternate_vertical_scan[i] = block_permute_op(j);
1324
        }
1325
        block_permute(default_intra_matrix);
1326
        block_permute(default_non_intra_matrix);
1327
        block_permute(ff_mpeg4_default_intra_matrix);
1328
        block_permute(ff_mpeg4_default_non_intra_matrix);
1329
    }
1330
    
1331
    build_zigzag_end();
1332
}
1333

    
1334
/* Remove any non bit exact operation (for testing purposes).
   Does nothing unless built with HAVE_MMX, in which case the request is
   forwarded to dsputil_set_bit_exact_mmx(). */
void avcodec_set_bit_exact(void)
{
#if defined(HAVE_MMX)
    dsputil_set_bit_exact_mmx();
#endif
}
1341

    
1342
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1343
              int orig_linesize[3], int coded_linesize,
1344
              AVCodecContext *avctx)
1345
{
1346
    int quad, diff, x, y;
1347
    UINT8 *orig, *coded;
1348
    UINT32 *sq = squareTbl + 256;
1349
    
1350
    quad = 0;
1351
    diff = 0;
1352
    
1353
    /* Luminance */
1354
    orig = orig_image[0];
1355
    coded = coded_image[0];
1356
    
1357
    for (y=0;y<avctx->height;y++) {
1358
        for (x=0;x<avctx->width;x++) {
1359
            diff = *(orig + x) - *(coded + x);
1360
            quad += sq[diff];
1361
        }
1362
        orig += orig_linesize[0];
1363
        coded += coded_linesize;
1364
    }
1365
   
1366
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1367
    
1368
    if (avctx->psnr_y) {
1369
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1370
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1371
    } else
1372
        avctx->psnr_y = 99.99;
1373
}
1374