Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 0da71265

History | View | Annotate | Download (101 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31

    
32

    
33
/* Clipping LUT: cropTbl[MAX_NEG_CROP + v] clamps v to 0..255.
 * Filled at runtime — the init code is not in this chunk; TODO confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square LUT used by the SSE/norm routines below via (squareTbl + 256),
 * i.e. indices -255..255.  Presumably squareTbl[256+v] == v*v, filled at
 * init time — verify against the dsputil init code. */
uint32_t squareTbl[512];

/* Classic zigzag coefficient scan order (row-major index for each scan pos). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Filled at runtime (init code not visible in this chunk). */
uint16_t __align8 inv_zigzag_direct16[64];

/* Alternate horizontal scan pattern (name suggests the MPEG "alternate"
 * scan for the horizontal case — semantics defined by the users of it). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate vertical scan pattern (vertical counterpart of the above). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* (inverse[b] is ceil(2^32 / b); entries 0 and 1 are placeholders.) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

/* Input permutation for the simple_idct_mmx */
/* (maps coefficient index -> permuted index expected by that IDCT) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119

    
120
/**
 * Sum all 256 pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return          sum of the 256 pixel values (fits easily in an int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;          /* advance to the next row */
    }
    return total;
}
141

    
142
static int pix_norm1_c(uint8_t * pix, int line_size)
143
{
144
    int s, i, j;
145
    uint32_t *sq = squareTbl + 256;
146

    
147
    s = 0;
148
    for (i = 0; i < 16; i++) {
149
        for (j = 0; j < 16; j += 8) {
150
#if 0
151
            s += sq[pix[0]];
152
            s += sq[pix[1]];
153
            s += sq[pix[2]];
154
            s += sq[pix[3]];
155
            s += sq[pix[4]];
156
            s += sq[pix[5]];
157
            s += sq[pix[6]];
158
            s += sq[pix[7]];
159
#else
160
#if LONG_MAX > 2147483647
161
            register uint64_t x=*(uint64_t*)pix;
162
            s += sq[x&0xff];
163
            s += sq[(x>>8)&0xff];
164
            s += sq[(x>>16)&0xff];
165
            s += sq[(x>>24)&0xff];
166
            s += sq[(x>>32)&0xff];
167
            s += sq[(x>>40)&0xff];
168
            s += sq[(x>>48)&0xff];
169
            s += sq[(x>>56)&0xff];
170
#else
171
            register uint32_t x=*(uint32_t*)pix;
172
            s += sq[x&0xff];
173
            s += sq[(x>>8)&0xff];
174
            s += sq[(x>>16)&0xff];
175
            s += sq[(x>>24)&0xff];
176
            x=*(uint32_t*)(pix+4);
177
            s += sq[x&0xff];
178
            s += sq[(x>>8)&0xff];
179
            s += sq[(x>>16)&0xff];
180
            s += sq[(x>>24)&0xff];
181
#endif
182
#endif
183
            pix += 8;
184
        }
185
        pix += line_size - 16;
186
    }
187
    return s;
188
}
189

    
190

    
191
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
192
{
193
    int s, i;
194
    uint32_t *sq = squareTbl + 256;
195

    
196
    s = 0;
197
    for (i = 0; i < 8; i++) {
198
        s += sq[pix1[0] - pix2[0]];
199
        s += sq[pix1[1] - pix2[1]];
200
        s += sq[pix1[2] - pix2[2]];
201
        s += sq[pix1[3] - pix2[3]];
202
        s += sq[pix1[4] - pix2[4]];
203
        s += sq[pix1[5] - pix2[5]];
204
        s += sq[pix1[6] - pix2[6]];
205
        s += sq[pix1[7] - pix2[7]];
206
        pix1 += line_size;
207
        pix2 += line_size;
208
    }
209
    return s;
210
}
211

    
212
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
213
{
214
    int s, i;
215
    uint32_t *sq = squareTbl + 256;
216

    
217
    s = 0;
218
    for (i = 0; i < 16; i++) {
219
        s += sq[pix1[ 0] - pix2[ 0]];
220
        s += sq[pix1[ 1] - pix2[ 1]];
221
        s += sq[pix1[ 2] - pix2[ 2]];
222
        s += sq[pix1[ 3] - pix2[ 3]];
223
        s += sq[pix1[ 4] - pix2[ 4]];
224
        s += sq[pix1[ 5] - pix2[ 5]];
225
        s += sq[pix1[ 6] - pix2[ 6]];
226
        s += sq[pix1[ 7] - pix2[ 7]];
227
        s += sq[pix1[ 8] - pix2[ 8]];
228
        s += sq[pix1[ 9] - pix2[ 9]];
229
        s += sq[pix1[10] - pix2[10]];
230
        s += sq[pix1[11] - pix2[11]];
231
        s += sq[pix1[12] - pix2[12]];
232
        s += sq[pix1[13] - pix2[13]];
233
        s += sq[pix1[14] - pix2[14]];
234
        s += sq[pix1[15] - pix2[15]];
235

    
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
/**
 * Copy an 8x8 pixel block into a contiguous array of 64 DCT coefficients.
 *
 * @param block     destination, 64 consecutive DCTELEMs (row-major)
 * @param pixels    source pixels
 * @param line_size byte stride of the source
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;  /* strided source */
        block += 8;           /* packed destination */
    }
}
260

    
261
/**
 * Store the element-wise difference of two 8x8 pixel blocks into a
 * contiguous array of 64 DCT coefficients (s1 - s2).
 *
 * @param block  destination, 64 consecutive DCTELEMs (row-major)
 * @param s1     minuend block
 * @param s2     subtrahend block
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
280

    
281

    
282
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
283
                                 int line_size)
284
{
285
    int i;
286
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
287
    
288
    /* read the pixels */
289
    for(i=0;i<8;i++) {
290
        pixels[0] = cm[block[0]];
291
        pixels[1] = cm[block[1]];
292
        pixels[2] = cm[block[2]];
293
        pixels[3] = cm[block[3]];
294
        pixels[4] = cm[block[4]];
295
        pixels[5] = cm[block[5]];
296
        pixels[6] = cm[block[6]];
297
        pixels[7] = cm[block[7]];
298

    
299
        pixels += line_size;
300
        block += 8;
301
    }
302
}
303

    
304
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
305
                          int line_size)
306
{
307
    int i;
308
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
309
    
310
    /* read the pixels */
311
    for(i=0;i<8;i++) {
312
        pixels[0] = cm[pixels[0] + block[0]];
313
        pixels[1] = cm[pixels[1] + block[1]];
314
        pixels[2] = cm[pixels[2] + block[2]];
315
        pixels[3] = cm[pixels[3] + block[3]];
316
        pixels[4] = cm[pixels[4] + block[4]];
317
        pixels[5] = cm[pixels[5] + block[5]];
318
        pixels[6] = cm[pixels[6] + block[6]];
319
        pixels[7] = cm[pixels[7] + block[7]];
320
        pixels += line_size;
321
        block += 8;
322
    }
323
}
324
/*
 * PIXOP2(OPNAME, OP) instantiates the whole family of pixel copy/average
 * motion-compensation primitives (full-pel, half-pel x2/y2/xy2, rounding and
 * no-rounding variants, 4/8/16-pixel widths) for a given store operation OP
 * (op_put = plain store, op_avg = rounded average with the destination).
 * Two variants exist: a 64-bit-word version (disabled below with "#if 0")
 * and the active 32-bit-word version.  The half-pel math uses the usual
 * SIMD-within-a-register tricks: per-byte averages computed via masked
 * low-bit (0x03.../0x01...) and high-bit (0xFC...) partial sums.
 *
 * NOTE(review): the disabled 64-bit variant defines OPNAME##_pixels but its
 * CALL_2X_PIXELS lines reference OPNAME##_pixels_c — it would not compile
 * as-is if re-enabled.  Confirm before switching variants.
 */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* Active 32-bit-word implementation of the PIXOP2 family. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* avg store op: per-word rounded average of 4 packed bytes (a+b+1)>>1 each */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
/* put store op: plain overwrite */
#define op_put(a, b) a = b

/* Instantiate the full averaging and copying families, then drop the ops. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar rounded averages used by the q-pel code further down the file */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
740

    
741

    
742
/**
 * One-point GMC: bilinear interpolation of an 8-pixel-wide strip at a
 * fixed 1/16-pel fractional offset (x16, y16 in 0..16).
 *
 * Each output pixel is the weighted sum of its 2x2 source neighbourhood,
 * with weights A..D summing to 256, then rounded and shifted down by 8.
 *
 * @param dst     destination strip (8 pixels wide, h rows)
 * @param src     source pixels (reads one extra column and row)
 * @param stride  byte stride of both src and dst
 * @param h       number of rows
 * @param x16     horizontal fractional position in 1/16 pel
 * @param y16     vertical fractional position in 1/16 pel
 * @param rounder rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++) {
            dst[x] = (A * src[x]          + B * src[x + 1]
                    + C * src[stride + x] + D * src[stride + x + 1]
                    + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
764

    
765
/**
 * Affine global motion compensation for an 8-pixel-wide block.
 * (ox,oy) walk the source in 16.16 fixed point; dxx/dyx advance the source
 * position per output column, dxy/dyy per output line.  Samples that fall
 * outside [0,width)x[0,height) are clamped to the border via clip().
 * r is the rounding constant added before the >>(2*shift) normalisation.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    width--;   /* now the last valid column index */
    height--;  /* now the last valid row index    */

    for (y = 0; y < h; y++) {
        int x;
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int idx;
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            /* NOTE(review): the fractional parts are extracted BEFORE the
               >>shift, exactly as the original did -- kept bit-identical. */
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned casts make negative coordinates look huge, so
               they fall through to the clamped (border) paths below */
            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear blend of the 4 neighbours */
                    idx = src_x + src_y * stride;
                    dst[y * stride + x] = ((  src[idx             ] * (s - frac_x)
                                            + src[idx           + 1] *      frac_x ) * (s - frac_y)
                                         + (  src[idx + stride    ] * (s - frac_x)
                                            + src[idx + stride + 1] *      frac_x ) *      frac_y
                                         + r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp the row, interpolate horizontally */
                    idx = src_x + clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = ((  src[idx    ] * (s - frac_x)
                                            + src[idx + 1] *      frac_x ) * s
                                         + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* horizontally outside: clamp the column, interpolate vertically */
                    idx = clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] = ((  src[idx         ] * (s - frac_y)
                                            + src[idx + stride] *      frac_y ) * s
                                         + r) >> (shift * 2);
                } else {
                    /* both outside: nearest clamped border pixel, no filtering */
                    idx = clip(src_x, 0, width) + clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[idx];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
822
/**
 * Generate the H.264 chroma motion-compensation functions of width 2, 4
 * and 8 for one output op.  x and y are the 1/8-pel fractional offsets
 * (0..7); each output pixel is the bilinear blend
 *     A*src[0,0] + B*src[1,0] + C*src[0,1] + D*src[1,1]
 * where the four weights sum to 64, and OP performs the final
 * rounding/scaling (and destination averaging for the avg variants).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<4; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<8; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}
884

    
885
/* Chroma MC output ops: 'b' carries a sum of weights totalling 64, so the
   result is rounded with +32 and scaled by >>6; op_avg then averages with
   the previous destination pixel (rounding up). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
892

    
893
/* Copy a 4-pixel-wide, h-row block with one 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
903

    
904
/* Copy an 8-pixel-wide, h-row block as two 32-bit words per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
915

    
916
/* Copy a 16-pixel-wide, h-row block as four 32-bit words per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
929

    
930
/* Copy a 17-pixel-wide, h-row block: four 32-bit words plus a tail byte
   per row (17-wide windows are needed by the 16x16 qpel filters). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
944

    
945
/* Copy a 9-pixel-wide, h-row block: two 32-bit words plus a tail byte
   per row (9-wide windows are needed by the 8x8 qpel filters). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
957

    
958

    
959
/**
 * Generate the complete set of MPEG-4 quarter-pel motion compensation
 * functions (8x8 and 16x16 blocks, all 16 qpel positions "mcXY") for one
 * output op OP.  The lowpass helpers apply an 8-tap filter with weights
 * 20, -6, 3, -1 on each side; near the block edges the outer taps reuse
 * border samples in a mirrored pattern (hence the repeated src[8]/src[16]
 * references on the last rows/columns).  RND selects between the rounding
 * ("_") and no-rounding ("_no_rnd_") halfpel helpers used for the
 * intermediate planes.  The ff_*_old_c variants are the non-static legacy
 * implementations that blend four planes with pixels*_l4.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* --- 8x8 qpel positions: mcXY where X,Y are the quarter-pel offsets --- */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];/* 9 rows of source padded to stride 16 */\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* --- 16x16 qpel positions, same structure with 24-wide padded buffers --- */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];/* 17 rows of source padded to stride 24 */\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
1441

    
1442
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1443
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1444
#define op_put(a, b) a = cm[((b) + 16)>>5]
1445
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1446

    
1447
QPEL_MC(0, put_       , _       , op_put)
1448
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1449
QPEL_MC(0, avg_       , _       , op_avg)
1450
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1451
#undef op_avg
1452
#undef op_avg_no_rnd
1453
#undef op_put
1454
#undef op_put_no_rnd
1455

    
1456
#if 1
/* H.264 6-tap (1,-5,20,20,-5,1) low-pass interpolation filters for 4x4, 8x8 and
 * 16x16 blocks: _h_ filters horizontally, _v_ vertically, _hv_ filters
 * horizontally into a 16-bit tmp buffer and then vertically (OP2 applies the
 * wider rounding for the doubly-filtered values). */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
1659

    
1660
/* Generates the 16 H.264 quarter-pel motion-compensation entry points
 * (_mcXY_c, with X/Y the horizontal/vertical quarter-pel phase 0..3) for a
 * given block SIZE, by combining the h/v/hv low-pass filters and 2-source
 * averaging (pixelsN_l2). copy_blockN is used to build a padded copy when the
 * vertical filter needs rows above/below the block. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
1796

    
1797
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1798
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1799
#define op_put(a, b)  a = cm[((b) + 16)>>5]
1800
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1801
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
1802

    
1803
H264_LOWPASS(put_       , op_put, op2_put)
1804
H264_LOWPASS(avg_       , op_avg, op2_avg)
1805
H264_MC(put_, 4)
1806
H264_MC(put_, 8)
1807
H264_MC(put_, 16)
1808
H264_MC(avg_, 4)
1809
H264_MC(avg_, 8)
1810
H264_MC(avg_, 16)
1811

    
1812
#undef op_avg
1813
#undef op_put
1814
#undef op2_avg
1815
#undef op2_put
1816
#endif
1817

    
1818
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1819
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1820
    int i;
1821

    
1822
    for(i=0; i<h; i++){
1823
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1824
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1825
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1826
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1827
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1828
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1829
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1830
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1831
        dst+=dstStride;
1832
        src+=srcStride;        
1833
    }
1834
}
1835

    
1836
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1837
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
1838
    int i;
1839

    
1840
    for(i=0; i<w; i++){
1841
        const int src_1= src[ -srcStride];
1842
        const int src0 = src[0          ];
1843
        const int src1 = src[  srcStride];
1844
        const int src2 = src[2*srcStride];
1845
        const int src3 = src[3*srcStride];
1846
        const int src4 = src[4*srcStride];
1847
        const int src5 = src[5*srcStride];
1848
        const int src6 = src[6*srcStride];
1849
        const int src7 = src[7*srcStride];
1850
        const int src8 = src[8*srcStride];
1851
        const int src9 = src[9*srcStride];
1852
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1853
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1854
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1855
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1856
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1857
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1858
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1859
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1860
        src++;
1861
        dst++;
1862
    }
1863
}
1864

    
1865
/* WMV2 mspel MC, phase (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
1868

    
1869
/* WMV2 mspel MC, phase (1,0): average of src and its horizontally filtered half. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
1874

    
1875
/* WMV2 mspel MC, phase (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1878

    
1879
/* WMV2 mspel MC, phase (3,0): average of src+1 and the horizontally filtered half. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
1884

    
1885
/* WMV2 mspel MC, phase (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1888

    
1889
/* WMV2 mspel MC, phase (1,2): average of the vertical half (halfV) and the
 * H-then-V filtered half (halfHV); halfH holds 11 filtered rows starting one
 * row above the block so the vertical pass can read rows -1..9. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
1898
/* WMV2 mspel MC, phase (3,2): like mc12 but the vertical half is taken at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
1907
/* WMV2 mspel MC, phase (2,2): horizontal filter over 11 rows, then vertical
 * filter of the middle rows straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1912

    
1913

    
1914
/**
 * Sum of absolute differences (SAD) between two 16x16 blocks.
 * Both blocks use the same line_size (bytes per row); rows are unrolled.
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
1941

    
1942
/**
 * SAD between a 16x16 block and a reference interpolated at half-pel
 * horizontal offset (avg2 of each pixel and its right neighbour);
 * reads pix2[0..16] on each row.
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
1969

    
1970
/**
 * SAD between a 16x16 block and a reference interpolated at half-pel
 * vertical offset (avg2 of each pixel and the one on the next row).
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1999

    
2000
/**
 * SAD between a 16x16 block and a reference interpolated at half-pel offset in
 * both directions (avg4 of a 2x2 neighbourhood); reads pix2/pix3[0..16] per row.
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */

    s = 0;
    for(i=0;i<16;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2029

    
2030
/**
 * Sum of absolute differences (SAD) between two 8x8 blocks.
 * Both blocks use the same line_size (bytes per row); rows are unrolled.
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2049

    
2050
/**
 * SAD between an 8x8 block and a reference interpolated at half-pel
 * horizontal offset (avg2 of each pixel and its right neighbour);
 * reads pix2[0..8] on each row.
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2069

    
2070
/**
 * SAD between an 8x8 block and a reference interpolated at half-pel
 * vertical offset (avg2 of each pixel and the one on the next row).
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2091

    
2092
/**
 * SAD between an 8x8 block and a reference interpolated at half-pel offset in
 * both directions (avg4 of a 2x2 neighbourhood); reads pix2/pix3[0..8] per row.
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */

    s = 0;
    for(i=0;i<8;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2113

    
2114
/* Comparison-function adapter: 16x16 SAD with the me_cmp-style signature
 * (the context pointer s is unused here). */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2117

    
2118
/** DSPContext-style 8x8 SAD wrapper; the context argument is unused. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    (void)s; /* not needed by the C implementation */
    return pix_abs8x8_c(a, b, stride);
}
2121

    
2122
/**
2123
 * permutes an 8x8 block.
2124
 * @param block the block which will be permuted according to the given permutation vector
2125
 * @param permutation the permutation vector
2126
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2127
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2128
 *                  (inverse) permutated to scantable order!
2129
 */
2130
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2131
{
2132
    int i;
2133
    DCTELEM temp[64];
2134
    
2135
    if(last<=0) return;
2136
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2137

    
2138
    for(i=0; i<=last; i++){
2139
        const int j= scantable[i];
2140
        temp[j]= block[j];
2141
        block[j]=0;
2142
    }
2143
    
2144
    for(i=0; i<=last; i++){
2145
        const int j= scantable[i];
2146
        const int perm_j= permutation[j];
2147
        block[perm_j]= temp[j];
2148
    }
2149
}
2150

    
2151
/**
2152
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2153
 */
2154
static void clear_blocks_c(DCTELEM *blocks)
2155
{
2156
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
2157
}
2158

    
2159
/**
 * byte-wise add: dst[i] += src[i] for 0 <= i < w (wraps modulo 256).
 * The main loop is unrolled by eight; a scalar loop consumes the tail.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    int idx = 0;

    while (idx + 7 < w) {
        dst[idx    ] += src[idx    ];
        dst[idx + 1] += src[idx + 1];
        dst[idx + 2] += src[idx + 2];
        dst[idx + 3] += src[idx + 3];
        dst[idx + 4] += src[idx + 4];
        dst[idx + 5] += src[idx + 5];
        dst[idx + 6] += src[idx + 6];
        dst[idx + 7] += src[idx + 7];
        idx += 8;
    }
    while (idx < w) {
        dst[idx] += src[idx];
        idx++;
    }
}
2174

    
2175
/**
 * byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w
 * (wraps modulo 256). The main loop is unrolled by eight; a scalar
 * loop consumes the tail.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int idx = 0;

    while (idx + 7 < w) {
        dst[idx    ] = src1[idx    ] - src2[idx    ];
        dst[idx + 1] = src1[idx + 1] - src2[idx + 1];
        dst[idx + 2] = src1[idx + 2] - src2[idx + 2];
        dst[idx + 3] = src1[idx + 3] - src2[idx + 3];
        dst[idx + 4] = src1[idx + 4] - src2[idx + 4];
        dst[idx + 5] = src1[idx + 5] - src2[idx + 5];
        dst[idx + 6] = src1[idx + 6] - src2[idx + 6];
        dst[idx + 7] = src1[idx + 7] - src2[idx + 7];
        idx += 8;
    }
    while (idx < w) {
        dst[idx] = src1[idx] - src2[idx];
        idx++;
    }
}
2190

    
2191
/* 2-point butterfly writing to separate outputs: o1 = sum, o2 = difference */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* 2-point butterfly operating in place: x becomes x+y, y becomes x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* final butterfly stage folded into an absolute sum: |x+y| + |x-y| */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2205

    
2206
/**
 * SATD-style comparison: 8x8 Hadamard transform of the src-dst difference,
 * returning the sum of absolute transformed coefficients.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int x, y;
    int t[64];
    int score = 0;

    /* horizontal pass: per row, store the src-dst difference and run an
       in-place 8-point butterfly network over it */
    for (y = 0; y < 8; y++) {
        int *row = t + 8*y;

        for (x = 0; x < 8; x++)
            row[x] = src[stride*y + x] - dst[stride*y + x];

        BUTTERFLY1(row[0], row[1]);
        BUTTERFLY1(row[2], row[3]);
        BUTTERFLY1(row[4], row[5]);
        BUTTERFLY1(row[6], row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: transform each column, folding the last butterfly
       stage into the absolute-value accumulation */
    for (x = 0; x < 8; x++) {
        BUTTERFLY1(t[8*0+x], t[8*1+x]);
        BUTTERFLY1(t[8*2+x], t[8*3+x]);
        BUTTERFLY1(t[8*4+x], t[8*5+x]);
        BUTTERFLY1(t[8*6+x], t[8*7+x]);

        BUTTERFLY1(t[8*0+x], t[8*2+x]);
        BUTTERFLY1(t[8*1+x], t[8*3+x]);
        BUTTERFLY1(t[8*4+x], t[8*6+x]);
        BUTTERFLY1(t[8*5+x], t[8*7+x]);

        score += BUTTERFLYA(t[8*0+x], t[8*4+x])
               + BUTTERFLYA(t[8*1+x], t[8*5+x])
               + BUTTERFLYA(t[8*2+x], t[8*6+x])
               + BUTTERFLYA(t[8*3+x], t[8*7+x]);
    }

    return score;
}
2255

    
2256
/**
 * 8x8 Hadamard transform of (src - mean), returning the sum of absolute
 * transformed coefficients.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int x, y;
    int t[64];
    int score = 0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: per row, store src-mean and run an in-place
       8-point butterfly network over it */
    for (y = 0; y < 8; y++) {
        int *row = t + 8*y;

        for (x = 0; x < 8; x++)
            row[x] = src[stride*y + x] - mean;

        BUTTERFLY1(row[0], row[1]);
        BUTTERFLY1(row[2], row[3]);
        BUTTERFLY1(row[4], row[5]);
        BUTTERFLY1(row[6], row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: transform each column, folding the last butterfly
       stage into the absolute-value accumulation */
    for (x = 0; x < 8; x++) {
        BUTTERFLY1(t[8*0+x], t[8*1+x]);
        BUTTERFLY1(t[8*2+x], t[8*3+x]);
        BUTTERFLY1(t[8*4+x], t[8*5+x]);
        BUTTERFLY1(t[8*6+x], t[8*7+x]);

        BUTTERFLY1(t[8*0+x], t[8*2+x]);
        BUTTERFLY1(t[8*1+x], t[8*3+x]);
        BUTTERFLY1(t[8*4+x], t[8*6+x]);
        BUTTERFLY1(t[8*5+x], t[8*7+x]);

        score += BUTTERFLYA(t[8*0+x], t[8*4+x])
               + BUTTERFLYA(t[8*1+x], t[8*5+x])
               + BUTTERFLYA(t[8*2+x], t[8*6+x])
               + BUTTERFLYA(t[8*3+x], t[8*7+x]);
    }

    return score;
}
2299

    
2300
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2301
    MpegEncContext * const s= (MpegEncContext *)c;
2302
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2303
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2304
    int sum=0, i;
2305

    
2306
    s->dsp.diff_pixels(temp, src1, src2, stride);
2307
    s->dsp.fdct(temp);
2308

    
2309
    for(i=0; i<64; i++)
2310
        sum+= ABS(temp[i]);
2311
        
2312
    return sum;
2313
}
2314

    
2315
void simple_idct(DCTELEM *block); //FIXME

/**
 * quant-PSNR comparison function: runs the src1-src2 difference through the
 * encoder's quantize -> dequantize -> IDCT chain and returns the summed
 * squared error between the round-tripped coefficients and the originals.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; /* two 64-coeff blocks */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; /* second half holds the reference copy */
    int sum=0, i;

    s->mb_intra=0; /* force the inter quantizer path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM)); /* keep a pre-quantization copy */

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 

    /* squared error between the round-tripped block and the saved original */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2339

    
2340
/**
 * rate-distortion comparison function: quantizes the src1-src2 difference,
 * estimates the VLC bit cost of the quantized coefficients, reconstructs the
 * block and returns distortion plus a qscale^2-weighted bit-cost term.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride]; /* VLA: 8*stride bytes, enough for 8 rows */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    /* back up the 8x8 src2 area into bak, 4 bytes at a time
       (NOTE(review): assumes stride >= 8 and 4-byte alignment — confirm at callers) */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the intra or inter VLC length tables; intra additionally pays
       for the DC coefficient (luma table only) */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    if(last>=start_i){
        /* run/level walk over all but the last nonzero coefficient */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64; /* bias so levels -64..63 map to table indices 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* level out of table range: escape code */
                run=0;
            }else
                run++;
        }
        /* the last nonzero coefficient uses the dedicated "last" table */
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64); /* by definition of last it must be nonzero */
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    /* reconstruct: dequantize and add the IDCT onto the backed-up pixels */
    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    /* distortion + rate term scaled by qscale^2 (109/128 ~ lambda weight) */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2413

    
2414
/**
 * bit-count comparison function: quantizes the src1-src2 difference and
 * returns the estimated number of VLC bits needed to code the coefficients.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the intra or inter VLC length tables; intra additionally pays
       for the DC coefficient (luma table only) */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    if(last>=start_i){
        /* run/level walk over all but the last nonzero coefficient */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64; /* bias so levels -64..63 map to table indices 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* level out of table range: escape code */
                run=0;
            }else
                run++;
        }
        /* the last nonzero coefficient uses the dedicated "last" table */
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64); /* by definition of last it must be nonzero */
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2471

    
2472

    
2473
/* NOTE(review): WARPER88_1616 is defined elsewhere in this file; it
   presumably builds a 16x16 comparison function from the 8x8 one by
   applying it to the four 8x8 quadrants — confirm against its definition. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2478

    
2479
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* IDCT (j_rev_dct) followed by a clamped store of the result into dest */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* IDCT (j_rev_dct) followed by a clamped add of the result onto dest */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2491

    
2492
/**
 * Fills a DSPContext with the C reference implementations, then lets the
 * platform-specific initializers override entries, and finally builds the
 * IDCT coefficient permutation table matching the selected IDCT.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    static int init_done = 0; /* global tables are built only once */
    int i;

    if (!init_done) {
        /* clamping table: 0..255 identity with saturated borders */
        for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
        for(i=0;i<MAX_NEG_CROP;i++) {
            cropTbl[i] = 0;
            cropTbl[i + MAX_NEG_CROP + 256] = 255;
        }

        /* squareTbl[i] = (i-256)^2, used for squared-error sums */
        for(i=0;i<512;i++) {
            squareTbl[i] = (i - 256) * (i - 256);
        }

        /* inverse zigzag + 1, used by the MMX quantizer */
        for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;

        init_done = 1;
    }

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if(avctx->dct_algo==FF_DCT_FASTINT)
        c->fdct = fdct_ifast;
    else
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type must match the IDCT */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* pixel access / block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs16x16     = pix_abs16x16_c;
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    c->pix_abs8x8     = pix_abs8x8_c;
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

/* half-pel put/avg tables: [IDX] selects block size, [0..3] the
   (x,y) half-pel interpolation variant */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
#undef dspfunc

/* quarter-pel tables: 16 entries for the 4x4 (x,y) qpel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma motion compensation */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* mspel (WMV2) motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
    
    /* comparison functions: [0] 16x16, [1] 8x8 */
    c->hadamard8_diff[0]= hadamard8_diff16_c;
    c->hadamard8_diff[1]= hadamard8_diff_c;
    c->hadamard8_abs = hadamard8_abs_c;
    
    c->dct_sad[0]= dct_sad16x16_c;
    c->dct_sad[1]= dct_sad8x8_c;
    
    c->sad[0]= sad16x16_c;
    c->sad[1]= sad8x8_c;
    
    c->quant_psnr[0]= quant_psnr16x16_c;
    c->quant_psnr[1]= quant_psnr8x8_c;

    c->rd[0]= rd16x16_c;
    c->rd[1]= rd8x8_c;

    c->bit[0]= bit16x16_c;
    c->bit[1]= bit8x8_c;
        
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;

    /* platform-specific initializers may override any of the above */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif

    /* build the coefficient permutation table for the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        fprintf(stderr, "Internal error, IDCT permutation not set\n");
    }
}
2685