Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ be7109c1

History | View | Annotate | Download (53.2 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
#include "avcodec.h"
22
#include "dsputil.h"
23
#include "simple_idct.h"
24

    
25
void (*ff_idct)(DCTELEM *block);
26
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
28
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
29
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
30
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
31
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
33
void (*clear_blocks)(DCTELEM *blocks);
34
int (*pix_sum)(UINT8 * pix, int line_size);
35
int (*pix_norm1)(UINT8 * pix, int line_size);
36

    
37
op_pixels_abs_func pix_abs16x16;
38
op_pixels_abs_func pix_abs16x16_x2;
39
op_pixels_abs_func pix_abs16x16_y2;
40
op_pixels_abs_func pix_abs16x16_xy2;
41

    
42
op_pixels_abs_func pix_abs8x8;
43
op_pixels_abs_func pix_abs8x8_x2;
44
op_pixels_abs_func pix_abs8x8_y2;
45
op_pixels_abs_func pix_abs8x8_xy2;
46

    
47
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
48
UINT32 squareTbl[512];
49

    
50
extern INT16 ff_mpeg1_default_intra_matrix[64];
51
extern INT16 ff_mpeg1_default_non_intra_matrix[64];
52
extern INT16 ff_mpeg4_default_intra_matrix[64];
53
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
54

    
55
/*
 * Zigzag scan of an 8x8 block: entry i is the raster position (row * 8 + col)
 * of the i-th coefficient in scan order.
 */
UINT8 zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
65

    
66
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
67
UINT16 __align8 inv_zigzag_direct16[64];
68

    
69
/* not permutated zigzag_direct for MMX quantizer */
70
UINT8 zigzag_direct_noperm[64];
71

    
72
/*
 * Alternate scan order that walks mostly along rows.
 * Entry i is the raster position (row * 8 + col) visited at scan step i.
 */
UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
82

    
83
/*
 * Alternate scan order that walks mostly down columns.
 * Entry i is the raster position (row * 8 + col) visited at scan step i.
 */
UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
93

    
94
#ifdef SIMPLE_IDCT

/*
 * Coefficient order expected on input by simple_idct_mmx.
 * Only compiled when SIMPLE_IDCT is defined.
 */
static UINT8 simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
108

    
109
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
110
UINT32 inverse[256]={
111
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
112
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
113
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
114
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
115
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
116
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
117
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
118
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
119
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
120
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
121
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
122
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
123
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
124
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
125
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
126
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
127
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
128
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
129
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
130
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
131
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
132
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
133
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
134
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
135
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
136
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
137
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
138
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
139
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
140
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
141
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
142
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
143
};
144

    
145
/* used to skip zeros at the end */
146
UINT8 zigzag_end[64];
147

    
148
UINT8 permutation[64];
149
//UINT8 invPermutation[64];
150

    
151
static void build_zigzag_end(void)
152
{
153
    int lastIndex;
154
    int lastIndexAfterPerm=0;
155
    for(lastIndex=0; lastIndex<64; lastIndex++)
156
    {
157
        if(zigzag_direct[lastIndex] > lastIndexAfterPerm) 
158
            lastIndexAfterPerm= zigzag_direct[lastIndex];
159
        zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
160
    }
161
}
162

    
163
/*
 * Add up every pixel of a 16x16 block.
 *
 * pix        first pixel of the top row
 * line_size  distance between the starts of consecutive rows
 *
 * Returns the sum of the 256 pixel values.
 */
int pix_sum_c(UINT8 * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            total += pix[col];
        }
        pix += line_size;
    }
    return total;
}
184

    
185
/*
 * Sum squareTbl[256 + p] over every pixel p of a 16x16 block.
 * The table is filled elsewhere; presumably it holds squares, which would
 * make the result the sum of squared pixel values -- confirm against the
 * table initialisation.
 *
 * pix        first pixel of the top row
 * line_size  distance between the starts of consecutive rows
 */
int pix_norm1_c(UINT8 * pix, int line_size)
{
    UINT32 *sq = squareTbl + 256;
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            total += sq[pix[col]];
        }
        pix += line_size;
    }
    return total;
}
207

    
208

    
209
/*
 * Copy an 8x8 block of pixels into a coefficient block.
 *
 * block      destination, 64 entries stored row after row
 * pixels     first pixel of the top source row
 * line_size  distance between the starts of consecutive source rows
 */
void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            block[col] = pixels[col];
        }
        pixels += line_size;
        block += 8;
    }
}
227

    
228
/*
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks.
 *
 * block   destination, 64 entries stored row after row
 * s1, s2  first pixel of the top row of each source block
 * stride  distance between the starts of consecutive rows, used for both
 *         sources
 */
void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
                   int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            block[col] = s1[col] - s2[col];
        }
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
247

    
248

    
249
/*
 * Write an 8x8 coefficient block out as pixels.
 * Every value is mapped through cropTbl, offset by MAX_NEG_CROP so that
 * negative values are valid indices.  The table contents (presumably a clamp
 * to the pixel range) are set up elsewhere.
 *
 * block      source, 64 entries stored row after row
 * pixels     first pixel of the top destination row
 * line_size  distance between the starts of consecutive destination rows
 */
void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            pixels[col] = cm[block[col]];
        }
        pixels += line_size;
        block += 8;
    }
}
270

    
271
/*
 * Add an 8x8 coefficient block onto existing pixels.
 * Each sum pixel + coefficient is mapped through cropTbl, offset by
 * MAX_NEG_CROP so that negative sums are valid indices.  The table contents
 * (presumably a clamp to the pixel range) are set up elsewhere.
 *
 * block      source, 64 entries stored row after row
 * pixels     first pixel of the top row to update in place
 * line_size  distance between the starts of consecutive pixel rows
 */
void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
                          int line_size)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            pixels[col] = cm[pixels[col] + block[col]];
        }
        pixels += line_size;
        block += 8;
    }
}
291
#if 0
292

293
/*
 * 64 bit flavour of PIXOP2: every row of 8 pixels is handled as one packed
 * 64 bit word.  OP(dst, value) decides how a result reaches the destination.
 *   (a&b) + (((a^b)&0xFE..FE)>>1)   per-byte average, rounded down
 *   (a|b) - (((a^b)&0xFE..FE)>>1)   per-byte average, rounded up
 * The xy2 functions average four neighbours: the low two bits and the high
 * six bits of each byte are summed separately so that no carry crosses a
 * byte boundary.  They process two rows per iteration, so h should be even.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t left  = LD64(pixels);\
        const uint64_t right = LD64(pixels + 1);\
        OP(*((uint64_t*)block), (left & right) + (((left ^ right) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t left  = LD64(pixels);\
        const uint64_t right = LD64(pixels + 1);\
        OP(*((uint64_t*)block), (left | right) - (((left ^ right) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t upper = LD64(pixels);\
        const uint64_t lower = LD64(pixels + line_size);\
        OP(*((uint64_t*)block), (upper & lower) + (((upper ^ lower) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t upper = LD64(pixels);\
        const uint64_t lower = LD64(pixels + line_size);\
        OP(*((uint64_t*)block), (upper | lower) - (((upper ^ lower) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint64_t a = LD64(pixels);\
    const uint64_t b = LD64(pixels + 1);\
    /* l0/h0: horizontal sums of one row (rounding term folded into l0) */\
    uint64_t l0 = (a & 0x0303030303030303ULL)\
                + (b & 0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    /* l1/h1: horizontal sums of the neighbouring row */\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        uint64_t p = LD64(pixels);\
        uint64_t q = LD64(pixels + 1);\
        l1 = (p & 0x0303030303030303ULL)\
           + (q & 0x0303030303030303ULL);\
        h1 = ((p & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((q & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        p = LD64(pixels);\
        q = LD64(pixels + 1);\
        l0 = (p & 0x0303030303030303ULL)\
           + (q & 0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0 = ((p & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((q & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint64_t a = LD64(pixels);\
    const uint64_t b = LD64(pixels + 1);\
    /* same as above, but the rounding term is 1 per byte instead of 2 */\
    uint64_t l0 = (a & 0x0303030303030303ULL)\
                + (b & 0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        uint64_t p = LD64(pixels);\
        uint64_t q = LD64(pixels + 1);\
        l1 = (p & 0x0303030303030303ULL)\
           + (q & 0x0303030303030303ULL);\
        h1 = ((p & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((q & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        p = LD64(pixels);\
        q = LD64(pixels + 1);\
        l0 = (p & 0x0303030303030303ULL)\
           + (q & 0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0 = ((p & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((q & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* destination = per-byte average of destination and value, rounded up */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
439
#else // end of the disabled 64 bit variant; the 32 bit variant follows
440

    
441
/*
 * 32 bit flavour of PIXOP2: every row of 8 pixels is handled as two packed
 * 32 bit words.  OP(dst, value) decides how a result reaches the destination.
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)   per-byte average, rounded down
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)   per-byte average, rounded up
 * The xy2 functions average four neighbours: the low two bits and the high
 * six bits of each byte are summed separately so that no carry crosses a
 * byte boundary.  They process two rows per iteration, so h should be even.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint32_t*)(block    )), LD32(pixels    ));\
        OP(*((uint32_t*)(block + 4)), LD32(pixels + 4));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += 4) {\
            const uint32_t left  = LD32(pixels + off);\
            const uint32_t right = LD32(pixels + off + 1);\
            OP(*((uint32_t*)(block + off)), (left & right) + (((left ^ right) & 0xFEFEFEFEUL) >> 1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += 4) {\
            const uint32_t left  = LD32(pixels + off);\
            const uint32_t right = LD32(pixels + off + 1);\
            OP(*((uint32_t*)(block + off)), (left | right) - (((left ^ right) & 0xFEFEFEFEUL) >> 1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += 4) {\
            const uint32_t upper = LD32(pixels + off);\
            const uint32_t lower = LD32(pixels + off + line_size);\
            OP(*((uint32_t*)(block + off)), (upper & lower) + (((upper ^ lower) & 0xFEFEFEFEUL) >> 1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += 4) {\
            const uint32_t upper = LD32(pixels + off);\
            const uint32_t lower = LD32(pixels + off + line_size);\
            OP(*((uint32_t*)(block + off)), (upper | lower) - (((upper ^ lower) & 0xFEFEFEFEUL) >> 1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    /* one pass per 4 pixel wide column of the 8 pixel wide block */\
    for (half = 0; half < 2; half++) {\
        int row;\
        const uint32_t a = LD32(pixels);\
        const uint32_t b = LD32(pixels + 1);\
        /* l0/h0: horizontal sums of one row (rounding term folded into l0) */\
        uint32_t l0 = (a & 0x03030303UL)\
                    + (b & 0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2)\
                    + ((b & 0xFCFCFCFCUL) >> 2);\
        /* l1/h1: horizontal sums of the neighbouring row */\
        uint32_t l1, h1;\
\
        pixels += line_size;\
        for (row = 0; row < h; row += 2) {\
            uint32_t p = LD32(pixels);\
            uint32_t q = LD32(pixels + 1);\
            l1 = (p & 0x03030303UL)\
               + (q & 0x03030303UL);\
            h1 = ((p & 0xFCFCFCFCUL) >> 2)\
               + ((q & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            l0 = (p & 0x03030303UL)\
               + (q & 0x03030303UL)\
               + 0x02020202UL;\
            h0 = ((p & 0xFCFCFCFCUL) >> 2)\
               + ((q & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        /* back to the top and 4 bytes right; pixels is one row ahead of block */\
        pixels += 4 - line_size * (h + 1);\
        block  += 4 - line_size * h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    /* same as above, but the rounding term is 1 per byte instead of 2 */\
    for (half = 0; half < 2; half++) {\
        int row;\
        const uint32_t a = LD32(pixels);\
        const uint32_t b = LD32(pixels + 1);\
        uint32_t l0 = (a & 0x03030303UL)\
                    + (b & 0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2)\
                    + ((b & 0xFCFCFCFCUL) >> 2);\
        uint32_t l1, h1;\
\
        pixels += line_size;\
        for (row = 0; row < h; row += 2) {\
            uint32_t p = LD32(pixels);\
            uint32_t q = LD32(pixels + 1);\
            l1 = (p & 0x03030303UL)\
               + (q & 0x03030303UL);\
            h1 = ((p & 0xFCFCFCFCUL) >> 2)\
               + ((q & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            l0 = (p & 0x03030303UL)\
               + (q & 0x03030303UL)\
               + 0x01010101UL;\
            h0 = ((p & 0xFCFCFCFCUL) >> 2)\
               + ((q & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        pixels += 4 - line_size * (h + 1);\
        block  += 4 - line_size * h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* destination = per-byte average of destination and value, rounded up */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
617
#endif
618
#define op_put(a, b) a = b
619

    
620
PIXOP2(avg, op_avg)
621
PIXOP2(put, op_put)
622
#undef op_avg
623
#undef op_put
624

    
625
#if 0
626
/* FIXME: this stuff could be removed as it's not really used anymore */
627
#define PIXOP(BTYPE, OPNAME, OP, INCR)                                                   \
628
                                                                                         \
629
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
630
{                                                                                        \
631
    BTYPE *p;                                                                            \
632
    const UINT8 *pix;                                                                    \
633
                                                                                         \
634
    p = block;                                                                           \
635
    pix = pixels;                                                                        \
636
    do {                                                                                 \
637
        OP(p[0], pix[0]);                                                                  \
638
        OP(p[1], pix[1]);                                                                  \
639
        OP(p[2], pix[2]);                                                                  \
640
        OP(p[3], pix[3]);                                                                  \
641
        OP(p[4], pix[4]);                                                                  \
642
        OP(p[5], pix[5]);                                                                  \
643
        OP(p[6], pix[6]);                                                                  \
644
        OP(p[7], pix[7]);                                                                  \
645
        pix += line_size;                                                                \
646
        p += INCR;                                                                       \
647
    } while (--h);;                                                                       \
648
}                                                                                        \
649
                                                                                         \
650
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
651
{                                                                                        \
652
    BTYPE *p;                                                                          \
653
    const UINT8 *pix;                                                                    \
654
                                                                                         \
655
    p = block;                                                                           \
656
    pix = pixels;                                                                        \
657
    do {                                                                   \
658
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
659
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
660
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
661
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
662
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
663
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
664
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
665
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
666
        pix += line_size;                                                                \
667
        p += INCR;                                                                       \
668
    } while (--h);                                                                        \
669
}                                                                                        \
670
                                                                                         \
671
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)     \
672
{                                                                                        \
673
    BTYPE *p;                                                                          \
674
    const UINT8 *pix;                                                                    \
675
    const UINT8 *pix1;                                                                   \
676
                                                                                         \
677
    p = block;                                                                           \
678
    pix = pixels;                                                                        \
679
    pix1 = pixels + line_size;                                                           \
680
    do {                                                                                 \
681
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
682
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
683
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
684
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
685
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
686
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
687
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
688
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
689
        pix += line_size;                                                                \
690
        pix1 += line_size;                                                               \
691
        p += INCR;                                                                       \
692
    } while(--h);                                                                         \
693
}                                                                                        \
694
                                                                                         \
695
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
696
{                                                                                        \
697
    BTYPE *p;                                                                          \
698
    const UINT8 *pix;                                                                    \
699
    const UINT8 *pix1;                                                                   \
700
                                                                                         \
701
    p = block;                                                                           \
702
    pix = pixels;                                                                        \
703
    pix1 = pixels + line_size;                                                           \
704
    do {                                                                   \
705
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
706
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
707
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
708
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
709
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
710
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
711
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
712
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
713
        pix += line_size;                                                                \
714
        pix1 += line_size;                                                               \
715
        p += INCR;                                                                       \
716
    } while(--h);                                                                         \
717
}                                                                                        \
718
                                                                                         \
719
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
720
    OPNAME ## _pixels,                                                                   \
721
    OPNAME ## _pixels_x2,                                                                \
722
    OPNAME ## _pixels_y2,                                                                \
723
    OPNAME ## _pixels_xy2,                                                               \
724
};
725

726
/* rounding primitives */
727
#define avg2(a,b) ((a+b+1)>>1)
728
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
729

730
#define op_avg(a, b) a = avg2(a, b)
731
#define op_sub(a, b) a -= b
732
#define op_put(a, b) a = b
733

734
PIXOP(DCTELEM, sub, op_sub, 8)
735
PIXOP(uint8_t, avg, op_avg, line_size)
736
PIXOP(uint8_t, put, op_put, line_size)
737

738
/* not rounding primitives */
739
#undef avg2
740
#undef avg4
741
#define avg2(a,b) ((a+b)>>1)
742
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
743

744
PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
745
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
746
/* motion estimation */
747

748
#undef avg2
749
#undef avg4
750
#endif
751

    
752
/*
 * Rounding averages used by the code below this point.
 * avg2: mean of two values, an exact half rounds up.
 * avg4: mean of four values, an exact half rounds up.
 * Arguments are parenthesized so that expressions such as `x | 1` or
 * `c ? a : b` are averaged as a whole instead of being torn apart by
 * operator precedence.  Arguments are still evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
754

    
755
/*
 * Bilinear interpolation of an 8-pixel-wide block with 1/16 sample weights.
 *
 * dst       destination rows (advanced by srcStride per row, like src)
 * src       source rows; columns 0..8 of the current and the next row are read
 * srcStride distance in bytes between rows, used for both src and dst
 * h         number of rows to produce
 * x16, y16  horizontal / vertical weight of the right / lower neighbour (of 16)
 * rounder   subtracted from the rounding bias of 128 before the >>8
 */
static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
{
    /* weights of the four neighbours; they always sum to 256 */
    const int wTopLeft     = (16 - x16) * (16 - y16);
    const int wTopRight    = x16 * (16 - y16);
    const int wBottomLeft  = (16 - x16) * y16;
    const int wBottomRight = x16 * y16;
    const int bias = 128 - rounder;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (wTopLeft     * src[col]
                      + wTopRight    * src[col + 1]
                      + wBottomLeft  * src[srcStride + col]
                      + wBottomRight * src[srcStride + col + 1]
                      + bias) >> 8;
        }
        dst += srcStride;
        src += srcStride;
    }
}
778

    
779
/*
 * One output sample of the horizontal filter: four pairs of source columns
 * weighted +20, -6, +3, -1, plus the rounding term r, shifted right by 5 and
 * clamped through the crop table.  Uses the locals `src`, `clip` and `r`.
 */
#define QPEL_H_TAPS(a0, a1, b0, b1, c0, c1, d0, d1) \
    clip[(((src[a0] + src[a1]) * 20 - (src[b0] + src[b1]) * 6 + \
           (src[c0] + src[c1]) * 3  - (src[d0] + src[d1]) + r) >> 5)]

/*
 * Horizontal lowpass filter producing 8 output samples per row.
 *
 * dst, dstStride  destination and its row pitch
 * src, srcStride  source and its row pitch; columns 0..8 of each row are read
 * h               number of rows to filter
 * r               rounding term added before the shift
 *
 * Towards the left and right edge the outer taps reuse columns inside 0..8
 * instead of reading beyond them.
 */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
{
    UINT8 *clip = cropTbl + MAX_NEG_CROP;
    int row;

    for (row = 0; row < h; row++) {
        dst[0] = QPEL_H_TAPS(0, 1,  0, 2,  1, 3,  2, 4);
        dst[1] = QPEL_H_TAPS(1, 2,  0, 3,  0, 4,  1, 5);
        dst[2] = QPEL_H_TAPS(2, 3,  1, 4,  0, 5,  0, 6);
        dst[3] = QPEL_H_TAPS(3, 4,  2, 5,  1, 6,  0, 7);
        dst[4] = QPEL_H_TAPS(4, 5,  3, 6,  2, 7,  1, 8);
        dst[5] = QPEL_H_TAPS(5, 6,  4, 7,  3, 8,  2, 8);
        dst[6] = QPEL_H_TAPS(6, 7,  5, 8,  4, 8,  3, 7);
        dst[7] = QPEL_H_TAPS(7, 8,  6, 8,  5, 7,  4, 6);
        dst += dstStride;
        src += srcStride;
    }
}
#undef QPEL_H_TAPS
797

    
798
/*
 * One output sample of the vertical filter: four pairs of the column samples
 * held in `s[]`, weighted +20, -6, +3, -1, plus the rounding term r, shifted
 * right by 5 and clamped through the crop table.  Uses the locals `s`,
 * `clip` and `r`.
 */
#define QPEL_V_TAPS(a0, a1, b0, b1, c0, c1, d0, d1) \
    clip[(((s[a0] + s[a1]) * 20 - (s[b0] + s[b1]) * 6 + \
           (s[c0] + s[c1]) * 3  - (s[d0] + s[d1]) + r) >> 5)]

/*
 * Vertical lowpass filter producing 8 output rows per column.
 *
 * dst, dstStride  destination and its row pitch
 * src, srcStride  source and its row pitch; rows 0..8 of each column are read
 * w               number of columns to filter
 * r               rounding term added before the shift
 *
 * All nine source samples of a column are fetched before the first output of
 * that column is written.
 */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
{
    UINT8 *clip = cropTbl + MAX_NEG_CROP;
    int col;

    for (col = 0; col < w; col++) {
        int s[9];
        int k;

        for (k = 0; k < 9; k++)
            s[k] = src[k * srcStride];

        dst[0 * dstStride] = QPEL_V_TAPS(0, 1,  0, 2,  1, 3,  2, 4);
        dst[1 * dstStride] = QPEL_V_TAPS(1, 2,  0, 3,  0, 4,  1, 5);
        dst[2 * dstStride] = QPEL_V_TAPS(2, 3,  1, 4,  0, 5,  0, 6);
        dst[3 * dstStride] = QPEL_V_TAPS(3, 4,  2, 5,  1, 6,  0, 7);
        dst[4 * dstStride] = QPEL_V_TAPS(4, 5,  3, 6,  2, 7,  1, 8);
        dst[5 * dstStride] = QPEL_V_TAPS(5, 6,  4, 7,  3, 8,  2, 8);
        dst[6 * dstStride] = QPEL_V_TAPS(6, 7,  5, 8,  4, 8,  3, 7);
        dst[7 * dstStride] = QPEL_V_TAPS(7, 8,  6, 8,  5, 7,  4, 6);
        dst++;
        src++;
    }
}
#undef QPEL_V_TAPS
825

    
826
/*
 * Copy an 8x8 block of bytes from src (row pitch srcStride) to dst (row
 * pitch dstStride), one byte at a time, left to right and top to bottom.
 */
static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = src[col];
        dst += dstStride;
        src += srcStride;
    }
}
843

    
844
/*
 * Write the average of two 8x8 blocks to dst.
 *
 * dst, dstStride   destination and its row pitch
 * src1, srcStride  first source and its row pitch
 * src2             second source, always stepped by a fixed pitch of 8
 * r                rounding term added to each sum before the >>1
 */
static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (src1[col] + src2[col] + r) >> 1;
        dst  += dstStride;
        src1 += srcStride;
        src2 += 8;
    }
}
862

    
863
/*
 * Write the average of four 8x8 blocks to dst.
 *
 * dst, dstStride    destination and its row pitch
 * src1, srcStride   first source and its row pitch
 * src2, src3, src4  remaining sources, each stepped by a fixed pitch of 8
 * r                 rounding term added to each sum before the >>2
 */
static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (src1[col] + src2[col] + src3[col] + src4[col] + r) >> 2;
        dst  += dstStride;
        src1 += srcStride;
        src2 += 8;
        src3 += 8;
        src4 += 8;
    }
}
883

    
884
#define QPEL_MC(r, name) \
885
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
886
{\
887
    put_block(dst, src, dstStride, srcStride);\
888
}\
889
\
890
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
891
{\
892
    UINT8 half[64];\
893
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
894
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
895
}\
896
\
897
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898
{\
899
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
900
}\
901
\
902
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
903
{\
904
    UINT8 half[64];\
905
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
906
    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
907
}\
908
\
909
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
910
{\
911
    UINT8 half[64];\
912
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
913
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
914
}\
915
\
916
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
917
{\
918
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
919
}\
920
\
921
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
922
{\
923
    UINT8 half[64];\
924
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
925
    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
926
}\
927
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
928
{\
929
    UINT8 halfH[72];\
930
    UINT8 halfV[64];\
931
    UINT8 halfHV[64];\
932
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
933
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
934
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
935
    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
936
}\
937
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
938
{\
939
    UINT8 halfH[72];\
940
    UINT8 halfV[64];\
941
    UINT8 halfHV[64];\
942
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
943
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
944
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
945
    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
946
}\
947
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
948
{\
949
    UINT8 halfH[72];\
950
    UINT8 halfV[64];\
951
    UINT8 halfHV[64];\
952
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
953
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
954
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
955
    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
956
}\
957
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
958
{\
959
    UINT8 halfH[72];\
960
    UINT8 halfV[64];\
961
    UINT8 halfHV[64];\
962
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
963
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
964
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
965
    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
966
}\
967
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
968
{\
969
    UINT8 halfH[72];\
970
    UINT8 halfHV[64];\
971
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
973
    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
974
}\
975
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
976
{\
977
    UINT8 halfH[72];\
978
    UINT8 halfHV[64];\
979
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
980
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
981
    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
982
}\
983
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
984
{\
985
    UINT8 halfH[72];\
986
    UINT8 halfV[64];\
987
    UINT8 halfHV[64];\
988
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
989
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
990
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
991
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
992
}\
993
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
994
{\
995
    UINT8 halfH[72];\
996
    UINT8 halfV[64];\
997
    UINT8 halfHV[64];\
998
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
999
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
1000
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
1001
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
1002
}\
1003
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
1004
{\
1005
    UINT8 halfH[72];\
1006
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1007
    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
1008
}\
1009
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
1010
    qpel_mc00_c ## name,                                                                   \
1011
    qpel_mc10_c ## name,                                                                   \
1012
    qpel_mc20_c ## name,                                                                   \
1013
    qpel_mc30_c ## name,                                                                   \
1014
    qpel_mc01_c ## name,                                                                   \
1015
    qpel_mc11_c ## name,                                                                   \
1016
    qpel_mc21_c ## name,                                                                   \
1017
    qpel_mc31_c ## name,                                                                   \
1018
    qpel_mc02_c ## name,                                                                   \
1019
    qpel_mc12_c ## name,                                                                   \
1020
    qpel_mc22_c ## name,                                                                   \
1021
    qpel_mc32_c ## name,                                                                   \
1022
    qpel_mc03_c ## name,                                                                   \
1023
    qpel_mc13_c ## name,                                                                   \
1024
    qpel_mc23_c ## name,                                                                   \
1025
    qpel_mc33_c ## name,                                                                   \
1026
};
1027

    
1028
QPEL_MC(0, _rnd)
1029
QPEL_MC(1, _no_rnd)
1030

    
1031
/*
 * Sum of absolute differences between two 16x16 blocks.
 * Both blocks are walked with the same row pitch line_size.
 */
int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1058

    
1059
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the block
 * at pix2 after averaging each pix2 sample with its right-hand neighbour
 * (avg2).  Columns 0..16 of every pix2 row are read.
 */
int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1086

    
1087
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the block
 * at pix2 after averaging each pix2 sample with the sample one row below it
 * (avg2).  Rows 0..16 of pix2 are read.
 */
int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1116

    
1117
/*
 * Sum of absolute differences between the 16x16 block at pix1 and the block
 * at pix2 after averaging each pix2 sample with its right, lower and
 * lower-right neighbours (avg4).  Columns 0..16 of rows 0..16 of pix2 are
 * read.
 */
int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1146

    
1147
/*
 * Sum of absolute differences between two 8x8 blocks.
 * Both blocks are walked with the same row pitch line_size.
 */
int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1166

    
1167
/*
 * Sum of absolute differences between the 8x8 block at pix1 and the block at
 * pix2 after averaging each pix2 sample with its right-hand neighbour
 * (avg2).  Columns 0..8 of every pix2 row are read.
 */
int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1186

    
1187
/*
 * Sum of absolute differences between the 8x8 block at pix1 and the block at
 * pix2 after averaging each pix2 sample with the sample one row below it
 * (avg2).  Rows 0..8 of pix2 are read.
 */
int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1208

    
1209
/*
 * Sum of absolute differences between the 8x8 block at pix1 and the block at
 * pix2 after averaging each pix2 sample with its right, lower and
 * lower-right neighbours (avg4).  Columns 0..8 of rows 0..8 of pix2 are read.
 */
int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        const UINT8 *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
1230

    
1231
/* permute block according so that it corresponds to the MMX idct
1232
   order */
1233
#ifdef SIMPLE_IDCT
1234
 /* general permutation, but perhaps slightly slower */
1235
void block_permute(INT16 *block)
1236
{
1237
        int i;
1238
        INT16 temp[64];
1239

    
1240
        for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1241

    
1242
        for(i=0; i<64; i++) block[i] = temp[i];
1243
}
1244
#else
1245

    
1246
void block_permute(INT16 *block)
1247
{
1248
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1249
    int i;
1250

    
1251
    for(i=0;i<8;i++) {
1252
        tmp1 = block[1];
1253
        tmp2 = block[2];
1254
        tmp3 = block[3];
1255
        tmp4 = block[4];
1256
        tmp5 = block[5];
1257
        tmp6 = block[6];
1258
        block[1] = tmp2;
1259
        block[2] = tmp4;
1260
        block[3] = tmp6;
1261
        block[4] = tmp1;
1262
        block[5] = tmp3;
1263
        block[6] = tmp5;
1264
        block += 8;
1265
    }
1266
}
1267
#endif
1268

    
1269
/* Zero six consecutive blocks of 64 DCT coefficients starting at blocks. */
void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
1273

    
1274
/* XXX: those functions should be suppressed ASAP when all IDCTs are
1275
   converted */
1276
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1277
{
1278
    ff_idct (block);
1279
    put_pixels_clamped(block, dest, line_size);
1280
}
1281

    
1282
/* Run the IDCT selected in ff_idct on block in place, then add the result
   to dest (row pitch line_size) through add_pixels_clamped. */
void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
{
    (*ff_idct)(block);
    (*add_pixels_clamped)(block, dest, line_size);
}
1287

    
1288
void dsputil_init(void)
1289
{
1290
    int i, j;
1291
    int use_permuted_idct;
1292

    
1293
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1294
    for(i=0;i<MAX_NEG_CROP;i++) {
1295
        cropTbl[i] = 0;
1296
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
1297
    }
1298

    
1299
    for(i=0;i<512;i++) {
1300
        squareTbl[i] = (i - 256) * (i - 256);
1301
    }
1302

    
1303
#ifdef SIMPLE_IDCT
1304
    ff_idct = NULL;
1305
#else
1306
    ff_idct = j_rev_dct;
1307
#endif
1308
    get_pixels = get_pixels_c;
1309
    diff_pixels = diff_pixels_c;
1310
    put_pixels_clamped = put_pixels_clamped_c;
1311
    add_pixels_clamped = add_pixels_clamped_c;
1312
    gmc1= gmc1_c;
1313
    clear_blocks= clear_blocks_c;
1314
    pix_sum= pix_sum_c;
1315
    pix_norm1= pix_norm1_c;
1316

    
1317
    pix_abs16x16     = pix_abs16x16_c;
1318
    pix_abs16x16_x2  = pix_abs16x16_x2_c;
1319
    pix_abs16x16_y2  = pix_abs16x16_y2_c;
1320
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1321
    pix_abs8x8     = pix_abs8x8_c;
1322
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
1323
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
1324
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1325

    
1326
    use_permuted_idct = 1;
1327

    
1328
#ifdef HAVE_MMX
1329
    dsputil_init_mmx();
1330
#endif
1331
#ifdef ARCH_ARMV4L
1332
    dsputil_init_armv4l();
1333
#endif
1334
#ifdef HAVE_MLIB
1335
    dsputil_init_mlib();
1336
    use_permuted_idct = 0;
1337
#endif
1338
#ifdef ARCH_ALPHA
1339
    dsputil_init_alpha();
1340
    use_permuted_idct = 0;
1341
#endif
1342
#ifdef ARCH_POWERPC
1343
    dsputil_init_ppc();
1344
#endif
1345

    
1346
#ifdef SIMPLE_IDCT
1347
    if (ff_idct == NULL) {
1348
        ff_idct_put = simple_idct_put;
1349
        ff_idct_add = simple_idct_add;
1350
        use_permuted_idct=0;
1351
    }
1352
#endif
1353
    if(ff_idct != NULL) {
1354
        ff_idct_put = gen_idct_put;
1355
        ff_idct_add = gen_idct_add;
1356
    }
1357

    
1358
    if(use_permuted_idct)
1359
#ifdef SIMPLE_IDCT
1360
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1361
#else
1362
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1363
#endif
1364
    else
1365
        for(i=0; i<64; i++) permutation[i]=i;
1366

    
1367
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1368
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1369
    
1370
    if (use_permuted_idct) {
1371
        /* permute for IDCT */
1372
        for(i=0;i<64;i++) {
1373
            j = zigzag_direct[i];
1374
            zigzag_direct[i] = block_permute_op(j);
1375
            j = ff_alternate_horizontal_scan[i];
1376
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
1377
            j = ff_alternate_vertical_scan[i];
1378
            ff_alternate_vertical_scan[i] = block_permute_op(j);
1379
        }
1380
        block_permute(ff_mpeg1_default_intra_matrix);
1381
        block_permute(ff_mpeg1_default_non_intra_matrix);
1382
        block_permute(ff_mpeg4_default_intra_matrix);
1383
        block_permute(ff_mpeg4_default_non_intra_matrix);
1384
    }
1385
    
1386
    build_zigzag_end();
1387
}
1388

    
1389
/* Disable every operation that is not bit exact (for testing purposes).
   Only the MMX code has anything to switch; without HAVE_MMX this is a
   no-op. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1396

    
1397
void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1398
              int orig_linesize[3], int coded_linesize,
1399
              AVCodecContext *avctx)
1400
{
1401
    int quad, diff, x, y;
1402
    UINT8 *orig, *coded;
1403
    UINT32 *sq = squareTbl + 256;
1404
    
1405
    quad = 0;
1406
    diff = 0;
1407
    
1408
    /* Luminance */
1409
    orig = orig_image[0];
1410
    coded = coded_image[0];
1411
    
1412
    for (y=0;y<avctx->height;y++) {
1413
        for (x=0;x<avctx->width;x++) {
1414
            diff = *(orig + x) - *(coded + x);
1415
            quad += sq[diff];
1416
        }
1417
        orig += orig_linesize[0];
1418
        coded += coded_linesize;
1419
    }
1420
   
1421
    avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1422
    
1423
    if (avctx->psnr_y) {
1424
        avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1425
        avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1426
    } else
1427
        avctx->psnr_y = 99.99;
1428
}
1429