Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 669ac79c

History | View | Annotate | Download (110 KB)

1
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */

/**
 * @file dsputil.c
 * DSP utils
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31

    
32

    
33
/* NOTE(review): storage only — filled in outside this chunk.  Used below as
 * cropTbl + MAX_NEG_CROP and indexed with signed coefficient values in the
 * *_pixels_clamped_c functions; presumably a clip-to-[0,255] lookup table —
 * confirm against the init code. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* NOTE(review): storage only — filled in outside this chunk.  Used below as
 * squareTbl + 256 with indices in [-255, 255] by the sse*/pix_norm functions;
 * presumably squareTbl[256 + x] == x*x — confirm against the init code. */
uint32_t squareTbl[512];
35

    
36
/* Classic 8x8 zig-zag scan: ff_zigzag_direct[i] is the raster-order (row*8+col)
 * index of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46

    
47
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): storage only — filled in outside this chunk; __align8 is a
 * project alignment macro from dsputil.h. */
uint16_t __align8 inv_zigzag_direct16[64];
49

    
50
/* Alternate (horizontal-biased) 8x8 scan order: entry i is the raster index of
 * the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
60

    
61
/* Alternate (vertical-biased) 8x8 scan order: entry i is the raster index of
 * the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
71

    
72
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table for division by multiplication:
 * inverse[b] is approximately ceil(2^32 / b) (saturated to UINT32_MAX for
 * b == 1, and 0 for the unused b == 0 slot), so (a * inverse[b]) >> 32
 * reproduces a/b exactly in the range documented above. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107

    
108
/* Input permutation for the simple_idct_mmx */
/* Each entry is a 6-bit raster index (0x00..0x3F): coefficient i of the
 * natural order must be stored at simple_mmx_permutation[i] before calling
 * the MMX simple IDCT. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119

    
120
/**
 * Sum of all pixel values of a 16x16 block.
 * @param pix        pointer to the top-left pixel of the block
 * @param line_size  distance in bytes between vertically adjacent pixels
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
141

    
142
static int pix_norm1_c(uint8_t * pix, int line_size)
143
{
144
    int s, i, j;
145
    uint32_t *sq = squareTbl + 256;
146

    
147
    s = 0;
148
    for (i = 0; i < 16; i++) {
149
        for (j = 0; j < 16; j += 8) {
150
#if 0
151
            s += sq[pix[0]];
152
            s += sq[pix[1]];
153
            s += sq[pix[2]];
154
            s += sq[pix[3]];
155
            s += sq[pix[4]];
156
            s += sq[pix[5]];
157
            s += sq[pix[6]];
158
            s += sq[pix[7]];
159
#else
160
#if LONG_MAX > 2147483647
161
            register uint64_t x=*(uint64_t*)pix;
162
            s += sq[x&0xff];
163
            s += sq[(x>>8)&0xff];
164
            s += sq[(x>>16)&0xff];
165
            s += sq[(x>>24)&0xff];
166
            s += sq[(x>>32)&0xff];
167
            s += sq[(x>>40)&0xff];
168
            s += sq[(x>>48)&0xff];
169
            s += sq[(x>>56)&0xff];
170
#else
171
            register uint32_t x=*(uint32_t*)pix;
172
            s += sq[x&0xff];
173
            s += sq[(x>>8)&0xff];
174
            s += sq[(x>>16)&0xff];
175
            s += sq[(x>>24)&0xff];
176
            x=*(uint32_t*)(pix+4);
177
            s += sq[x&0xff];
178
            s += sq[(x>>8)&0xff];
179
            s += sq[(x>>16)&0xff];
180
            s += sq[(x>>24)&0xff];
181
#endif
182
#endif
183
            pix += 8;
184
        }
185
        pix += line_size - 16;
186
    }
187
    return s;
188
}
189

    
190

    
191
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
192
{
193
    int s, i;
194
    uint32_t *sq = squareTbl + 256;
195

    
196
    s = 0;
197
    for (i = 0; i < 8; i++) {
198
        s += sq[pix1[0] - pix2[0]];
199
        s += sq[pix1[1] - pix2[1]];
200
        s += sq[pix1[2] - pix2[2]];
201
        s += sq[pix1[3] - pix2[3]];
202
        s += sq[pix1[4] - pix2[4]];
203
        s += sq[pix1[5] - pix2[5]];
204
        s += sq[pix1[6] - pix2[6]];
205
        s += sq[pix1[7] - pix2[7]];
206
        pix1 += line_size;
207
        pix2 += line_size;
208
    }
209
    return s;
210
}
211

    
212
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
213
{
214
    int s, i;
215
    uint32_t *sq = squareTbl + 256;
216

    
217
    s = 0;
218
    for (i = 0; i < 16; i++) {
219
        s += sq[pix1[ 0] - pix2[ 0]];
220
        s += sq[pix1[ 1] - pix2[ 1]];
221
        s += sq[pix1[ 2] - pix2[ 2]];
222
        s += sq[pix1[ 3] - pix2[ 3]];
223
        s += sq[pix1[ 4] - pix2[ 4]];
224
        s += sq[pix1[ 5] - pix2[ 5]];
225
        s += sq[pix1[ 6] - pix2[ 6]];
226
        s += sq[pix1[ 7] - pix2[ 7]];
227
        s += sq[pix1[ 8] - pix2[ 8]];
228
        s += sq[pix1[ 9] - pix2[ 9]];
229
        s += sq[pix1[10] - pix2[10]];
230
        s += sq[pix1[11] - pix2[11]];
231
        s += sq[pix1[12] - pix2[12]];
232
        s += sq[pix1[13] - pix2[13]];
233
        s += sq[pix1[14] - pix2[14]];
234
        s += sq[pix1[15] - pix2[15]];
235

    
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
/**
 * Copy an 8x8 block of 8-bit pixels into an array of DCT coefficients,
 * widening each sample.
 * @param block      destination, 64 consecutive DCTELEMs in row-major order
 * @param pixels     source top-left pixel
 * @param line_size  source stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
260

    
261
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks as DCT
 * coefficients.
 * @param block   destination, 64 consecutive DCTELEMs in row-major order
 * @param s1      first source block
 * @param s2      second source block
 * @param stride  stride in bytes, shared by both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
280

    
281

    
282
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
283
                                 int line_size)
284
{
285
    int i;
286
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
287
    
288
    /* read the pixels */
289
    for(i=0;i<8;i++) {
290
        pixels[0] = cm[block[0]];
291
        pixels[1] = cm[block[1]];
292
        pixels[2] = cm[block[2]];
293
        pixels[3] = cm[block[3]];
294
        pixels[4] = cm[block[4]];
295
        pixels[5] = cm[block[5]];
296
        pixels[6] = cm[block[6]];
297
        pixels[7] = cm[block[7]];
298

    
299
        pixels += line_size;
300
        block += 8;
301
    }
302
}
303

    
304
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
305
                          int line_size)
306
{
307
    int i;
308
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
309
    
310
    /* read the pixels */
311
    for(i=0;i<8;i++) {
312
        pixels[0] = cm[pixels[0] + block[0]];
313
        pixels[1] = cm[pixels[1] + block[1]];
314
        pixels[2] = cm[pixels[2] + block[2]];
315
        pixels[3] = cm[pixels[3] + block[3]];
316
        pixels[4] = cm[pixels[4] + block[4]];
317
        pixels[5] = cm[pixels[5] + block[5]];
318
        pixels[6] = cm[pixels[6] + block[6]];
319
        pixels[7] = cm[pixels[7] + block[7]];
320
        pixels += line_size;
321
        block += 8;
322
    }
323
}
324
#if 0
325

326
#define PIXOP2(OPNAME, OP) \
327
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
328
{\
329
    int i;\
330
    for(i=0; i<h; i++){\
331
        OP(*((uint64_t*)block), LD64(pixels));\
332
        pixels+=line_size;\
333
        block +=line_size;\
334
    }\
335
}\
336
\
337
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
338
{\
339
    int i;\
340
    for(i=0; i<h; i++){\
341
        const uint64_t a= LD64(pixels  );\
342
        const uint64_t b= LD64(pixels+1);\
343
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
344
        pixels+=line_size;\
345
        block +=line_size;\
346
    }\
347
}\
348
\
349
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
350
{\
351
    int i;\
352
    for(i=0; i<h; i++){\
353
        const uint64_t a= LD64(pixels  );\
354
        const uint64_t b= LD64(pixels+1);\
355
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
356
        pixels+=line_size;\
357
        block +=line_size;\
358
    }\
359
}\
360
\
361
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
362
{\
363
    int i;\
364
    for(i=0; i<h; i++){\
365
        const uint64_t a= LD64(pixels          );\
366
        const uint64_t b= LD64(pixels+line_size);\
367
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
368
        pixels+=line_size;\
369
        block +=line_size;\
370
    }\
371
}\
372
\
373
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
374
{\
375
    int i;\
376
    for(i=0; i<h; i++){\
377
        const uint64_t a= LD64(pixels          );\
378
        const uint64_t b= LD64(pixels+line_size);\
379
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
380
        pixels+=line_size;\
381
        block +=line_size;\
382
    }\
383
}\
384
\
385
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
386
{\
387
        int i;\
388
        const uint64_t a= LD64(pixels  );\
389
        const uint64_t b= LD64(pixels+1);\
390
        uint64_t l0=  (a&0x0303030303030303ULL)\
391
                    + (b&0x0303030303030303ULL)\
392
                    + 0x0202020202020202ULL;\
393
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
395
        uint64_t l1,h1;\
396
\
397
        pixels+=line_size;\
398
        for(i=0; i<h; i+=2){\
399
            uint64_t a= LD64(pixels  );\
400
            uint64_t b= LD64(pixels+1);\
401
            l1=  (a&0x0303030303030303ULL)\
402
               + (b&0x0303030303030303ULL);\
403
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
404
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
405
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
406
            pixels+=line_size;\
407
            block +=line_size;\
408
            a= LD64(pixels  );\
409
            b= LD64(pixels+1);\
410
            l0=  (a&0x0303030303030303ULL)\
411
               + (b&0x0303030303030303ULL)\
412
               + 0x0202020202020202ULL;\
413
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
414
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
416
            pixels+=line_size;\
417
            block +=line_size;\
418
        }\
419
}\
420
\
421
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
422
{\
423
        int i;\
424
        const uint64_t a= LD64(pixels  );\
425
        const uint64_t b= LD64(pixels+1);\
426
        uint64_t l0=  (a&0x0303030303030303ULL)\
427
                    + (b&0x0303030303030303ULL)\
428
                    + 0x0101010101010101ULL;\
429
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
430
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
431
        uint64_t l1,h1;\
432
\
433
        pixels+=line_size;\
434
        for(i=0; i<h; i+=2){\
435
            uint64_t a= LD64(pixels  );\
436
            uint64_t b= LD64(pixels+1);\
437
            l1=  (a&0x0303030303030303ULL)\
438
               + (b&0x0303030303030303ULL);\
439
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
440
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
441
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
442
            pixels+=line_size;\
443
            block +=line_size;\
444
            a= LD64(pixels  );\
445
            b= LD64(pixels+1);\
446
            l0=  (a&0x0303030303030303ULL)\
447
               + (b&0x0303030303030303ULL)\
448
               + 0x0101010101010101ULL;\
449
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
450
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
452
            pixels+=line_size;\
453
            block +=line_size;\
454
        }\
455
}\
456
\
457
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
458
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
459
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
460
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
461
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
462
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
463
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
464

465
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
466
#else // 64 bit variant
467

    
468
#define PIXOP2(OPNAME, OP) \
469
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
470
    int i;\
471
    for(i=0; i<h; i++){\
472
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
473
        pixels+=line_size;\
474
        block +=line_size;\
475
    }\
476
}\
477
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
478
    int i;\
479
    for(i=0; i<h; i++){\
480
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
481
        pixels+=line_size;\
482
        block +=line_size;\
483
    }\
484
}\
485
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
486
    int i;\
487
    for(i=0; i<h; i++){\
488
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
489
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
490
        pixels+=line_size;\
491
        block +=line_size;\
492
    }\
493
}\
494
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
495
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
496
}\
497
\
498
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
499
                                                int src_stride1, int src_stride2, int h){\
500
    int i;\
501
    for(i=0; i<h; i++){\
502
        uint32_t a,b;\
503
        a= LD32(&src1[i*src_stride1  ]);\
504
        b= LD32(&src2[i*src_stride2  ]);\
505
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
506
        a= LD32(&src1[i*src_stride1+4]);\
507
        b= LD32(&src2[i*src_stride2+4]);\
508
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
509
    }\
510
}\
511
\
512
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
513
                                                int src_stride1, int src_stride2, int h){\
514
    int i;\
515
    for(i=0; i<h; i++){\
516
        uint32_t a,b;\
517
        a= LD32(&src1[i*src_stride1  ]);\
518
        b= LD32(&src2[i*src_stride2  ]);\
519
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
520
        a= LD32(&src1[i*src_stride1+4]);\
521
        b= LD32(&src2[i*src_stride2+4]);\
522
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
523
    }\
524
}\
525
\
526
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
527
                                                int src_stride1, int src_stride2, int h){\
528
    int i;\
529
    for(i=0; i<h; i++){\
530
        uint32_t a,b;\
531
        a= LD32(&src1[i*src_stride1  ]);\
532
        b= LD32(&src2[i*src_stride2  ]);\
533
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
534
    }\
535
}\
536
\
537
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
538
                                                int src_stride1, int src_stride2, int h){\
539
    int i;\
540
    for(i=0; i<h; i++){\
541
        uint32_t a,b;\
542
        a= LD16(&src1[i*src_stride1  ]);\
543
        b= LD16(&src2[i*src_stride2  ]);\
544
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
545
    }\
546
}\
547
\
548
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
549
                                                int src_stride1, int src_stride2, int h){\
550
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
551
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
552
}\
553
\
554
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
555
                                                int src_stride1, int src_stride2, int h){\
556
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
557
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
558
}\
559
\
560
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
561
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
562
}\
563
\
564
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
565
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
566
}\
567
\
568
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
569
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
570
}\
571
\
572
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
573
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
574
}\
575
\
576
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
577
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
578
    int i;\
579
    for(i=0; i<h; i++){\
580
        uint32_t a, b, c, d, l0, l1, h0, h1;\
581
        a= LD32(&src1[i*src_stride1]);\
582
        b= LD32(&src2[i*src_stride2]);\
583
        c= LD32(&src3[i*src_stride3]);\
584
        d= LD32(&src4[i*src_stride4]);\
585
        l0=  (a&0x03030303UL)\
586
           + (b&0x03030303UL)\
587
           + 0x02020202UL;\
588
        h0= ((a&0xFCFCFCFCUL)>>2)\
589
          + ((b&0xFCFCFCFCUL)>>2);\
590
        l1=  (c&0x03030303UL)\
591
           + (d&0x03030303UL);\
592
        h1= ((c&0xFCFCFCFCUL)>>2)\
593
          + ((d&0xFCFCFCFCUL)>>2);\
594
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
595
        a= LD32(&src1[i*src_stride1+4]);\
596
        b= LD32(&src2[i*src_stride2+4]);\
597
        c= LD32(&src3[i*src_stride3+4]);\
598
        d= LD32(&src4[i*src_stride4+4]);\
599
        l0=  (a&0x03030303UL)\
600
           + (b&0x03030303UL)\
601
           + 0x02020202UL;\
602
        h0= ((a&0xFCFCFCFCUL)>>2)\
603
          + ((b&0xFCFCFCFCUL)>>2);\
604
        l1=  (c&0x03030303UL)\
605
           + (d&0x03030303UL);\
606
        h1= ((c&0xFCFCFCFCUL)>>2)\
607
          + ((d&0xFCFCFCFCUL)>>2);\
608
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
609
    }\
610
}\
611
\
612
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
613
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
614
}\
615
\
616
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
617
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
618
}\
619
\
620
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
621
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
622
}\
623
\
624
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
625
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
626
}\
627
\
628
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
629
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
630
    int i;\
631
    for(i=0; i<h; i++){\
632
        uint32_t a, b, c, d, l0, l1, h0, h1;\
633
        a= LD32(&src1[i*src_stride1]);\
634
        b= LD32(&src2[i*src_stride2]);\
635
        c= LD32(&src3[i*src_stride3]);\
636
        d= LD32(&src4[i*src_stride4]);\
637
        l0=  (a&0x03030303UL)\
638
           + (b&0x03030303UL)\
639
           + 0x01010101UL;\
640
        h0= ((a&0xFCFCFCFCUL)>>2)\
641
          + ((b&0xFCFCFCFCUL)>>2);\
642
        l1=  (c&0x03030303UL)\
643
           + (d&0x03030303UL);\
644
        h1= ((c&0xFCFCFCFCUL)>>2)\
645
          + ((d&0xFCFCFCFCUL)>>2);\
646
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
647
        a= LD32(&src1[i*src_stride1+4]);\
648
        b= LD32(&src2[i*src_stride2+4]);\
649
        c= LD32(&src3[i*src_stride3+4]);\
650
        d= LD32(&src4[i*src_stride4+4]);\
651
        l0=  (a&0x03030303UL)\
652
           + (b&0x03030303UL)\
653
           + 0x01010101UL;\
654
        h0= ((a&0xFCFCFCFCUL)>>2)\
655
          + ((b&0xFCFCFCFCUL)>>2);\
656
        l1=  (c&0x03030303UL)\
657
           + (d&0x03030303UL);\
658
        h1= ((c&0xFCFCFCFCUL)>>2)\
659
          + ((d&0xFCFCFCFCUL)>>2);\
660
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
661
    }\
662
}\
663
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
664
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
665
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
666
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
667
}\
668
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
669
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
670
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
671
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
672
}\
673
\
674
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
675
{\
676
        int i, a0, b0, a1, b1;\
677
        a0= pixels[0];\
678
        b0= pixels[1] + 2;\
679
        a0 += b0;\
680
        b0 += pixels[2];\
681
\
682
        pixels+=line_size;\
683
        for(i=0; i<h; i+=2){\
684
            a1= pixels[0];\
685
            b1= pixels[1];\
686
            a1 += b1;\
687
            b1 += pixels[2];\
688
\
689
            block[0]= (a1+a0)>>2; /* FIXME non put */\
690
            block[1]= (b1+b0)>>2;\
691
\
692
            pixels+=line_size;\
693
            block +=line_size;\
694
\
695
            a0= pixels[0];\
696
            b0= pixels[1] + 2;\
697
            a0 += b0;\
698
            b0 += pixels[2];\
699
\
700
            block[0]= (a1+a0)>>2;\
701
            block[1]= (b1+b0)>>2;\
702
            pixels+=line_size;\
703
            block +=line_size;\
704
        }\
705
}\
706
\
707
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
708
{\
709
        int i;\
710
        const uint32_t a= LD32(pixels  );\
711
        const uint32_t b= LD32(pixels+1);\
712
        uint32_t l0=  (a&0x03030303UL)\
713
                    + (b&0x03030303UL)\
714
                    + 0x02020202UL;\
715
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
716
                   + ((b&0xFCFCFCFCUL)>>2);\
717
        uint32_t l1,h1;\
718
\
719
        pixels+=line_size;\
720
        for(i=0; i<h; i+=2){\
721
            uint32_t a= LD32(pixels  );\
722
            uint32_t b= LD32(pixels+1);\
723
            l1=  (a&0x03030303UL)\
724
               + (b&0x03030303UL);\
725
            h1= ((a&0xFCFCFCFCUL)>>2)\
726
              + ((b&0xFCFCFCFCUL)>>2);\
727
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
728
            pixels+=line_size;\
729
            block +=line_size;\
730
            a= LD32(pixels  );\
731
            b= LD32(pixels+1);\
732
            l0=  (a&0x03030303UL)\
733
               + (b&0x03030303UL)\
734
               + 0x02020202UL;\
735
            h0= ((a&0xFCFCFCFCUL)>>2)\
736
              + ((b&0xFCFCFCFCUL)>>2);\
737
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
738
            pixels+=line_size;\
739
            block +=line_size;\
740
        }\
741
}\
742
\
743
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
744
{\
745
    int j;\
746
    for(j=0; j<2; j++){\
747
        int i;\
748
        const uint32_t a= LD32(pixels  );\
749
        const uint32_t b= LD32(pixels+1);\
750
        uint32_t l0=  (a&0x03030303UL)\
751
                    + (b&0x03030303UL)\
752
                    + 0x02020202UL;\
753
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
754
                   + ((b&0xFCFCFCFCUL)>>2);\
755
        uint32_t l1,h1;\
756
\
757
        pixels+=line_size;\
758
        for(i=0; i<h; i+=2){\
759
            uint32_t a= LD32(pixels  );\
760
            uint32_t b= LD32(pixels+1);\
761
            l1=  (a&0x03030303UL)\
762
               + (b&0x03030303UL);\
763
            h1= ((a&0xFCFCFCFCUL)>>2)\
764
              + ((b&0xFCFCFCFCUL)>>2);\
765
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
766
            pixels+=line_size;\
767
            block +=line_size;\
768
            a= LD32(pixels  );\
769
            b= LD32(pixels+1);\
770
            l0=  (a&0x03030303UL)\
771
               + (b&0x03030303UL)\
772
               + 0x02020202UL;\
773
            h0= ((a&0xFCFCFCFCUL)>>2)\
774
              + ((b&0xFCFCFCFCUL)>>2);\
775
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
776
            pixels+=line_size;\
777
            block +=line_size;\
778
        }\
779
        pixels+=4-line_size*(h+1);\
780
        block +=4-line_size*h;\
781
    }\
782
}\
783
\
784
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
785
{\
786
    int j;\
787
    for(j=0; j<2; j++){\
788
        int i;\
789
        const uint32_t a= LD32(pixels  );\
790
        const uint32_t b= LD32(pixels+1);\
791
        uint32_t l0=  (a&0x03030303UL)\
792
                    + (b&0x03030303UL)\
793
                    + 0x01010101UL;\
794
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
795
                   + ((b&0xFCFCFCFCUL)>>2);\
796
        uint32_t l1,h1;\
797
\
798
        pixels+=line_size;\
799
        for(i=0; i<h; i+=2){\
800
            uint32_t a= LD32(pixels  );\
801
            uint32_t b= LD32(pixels+1);\
802
            l1=  (a&0x03030303UL)\
803
               + (b&0x03030303UL);\
804
            h1= ((a&0xFCFCFCFCUL)>>2)\
805
              + ((b&0xFCFCFCFCUL)>>2);\
806
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
807
            pixels+=line_size;\
808
            block +=line_size;\
809
            a= LD32(pixels  );\
810
            b= LD32(pixels+1);\
811
            l0=  (a&0x03030303UL)\
812
               + (b&0x03030303UL)\
813
               + 0x01010101UL;\
814
            h0= ((a&0xFCFCFCFCUL)>>2)\
815
              + ((b&0xFCFCFCFCUL)>>2);\
816
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
817
            pixels+=line_size;\
818
            block +=line_size;\
819
        }\
820
        pixels+=4-line_size*(h+1);\
821
        block +=4-line_size*h;\
822
    }\
823
}\
824
\
825
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
826
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
827
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
828
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
829
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
830
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
831
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
832
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
833

    
834
#define op_avg(a, b) a = rnd_avg32(a, b)
835
#endif
836
#define op_put(a, b) a = b
837

    
838
PIXOP2(avg, op_avg)
839
PIXOP2(put, op_put)
840
#undef op_avg
841
#undef op_put
842

    
843
#define avg2(a,b) ((a+b+1)>>1)
844
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
845

    
846

    
847
/**
 * One-motion-vector GMC (MPEG-4 global motion compensation) for an
 * 8-pixel-wide block: bilinear interpolation at a fixed 1/16-pel offset.
 *
 * @param x16,y16  fractional position in 1/16-pel units (0..15)
 * @param rounder  added before the >>8; the four weights sum to 256
 * reads src rows 0..h and columns 0..8 (one extra row and column).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
869

    
870
/**
 * Affine GMC: per-pixel motion given by a 2x2 increment matrix
 * (dxx,dxy,dyx,dyy) starting at (ox,oy) in 16.16 fixed point; samples
 * are bilinearly interpolated with 1/(1<<shift)-pel precision and
 * clamped to the [0,width-1]x[0,height-1] source rectangle at the edges.
 *
 * @param shift  subpel precision; s = 1<<shift
 * @param r      rounder added before the >>(shift*2)
 * NOTE(review): the block width is hard-coded to 8 (inner x loop).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2D bilinear */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* clamped vertically: 1D horizontal interpolation */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clamped horizontally: 1D vertical interpolation */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped both ways: nearest edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
927

    
928
/**
 * Thirdpel MC, no offset: plain block copy, dispatched on block width
 * to the fixed-width put_pixels helpers generated by PIXOP2 above.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
936

    
937
/**
 * Thirdpel MC, horizontal offset 1/3:
 * dst = round((2*src[x] + src[x+1]) / 3), using 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
947

    
948
/**
 * Thirdpel MC, horizontal offset 2/3:
 * dst = round((src[x] + 2*src[x+1]) / 3), using 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
958
    
959
/**
 * Thirdpel MC, vertical offset 1/3:
 * dst = round((2*src[y] + src[y+1]) / 3), using 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
969
    
970
/**
 * Thirdpel MC, offset (1/3, 1/3): bilinear with weights 4/3/3/2 over the
 * 2x2 neighborhood (sum 12), using 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
980

    
981
/**
 * Thirdpel MC, offset (2/3, 1/3): bilinear with weights 3/4/2/3 over the
 * 2x2 neighborhood (sum 12), using 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
991

    
992
/**
 * Thirdpel MC, vertical offset 2/3:
 * dst = round((src[y] + 2*src[y+1]) / 3), using 683/2048 ~= 1/3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1002

    
1003
/**
 * Thirdpel MC, offset (1/3, 2/3): bilinear with weights 3/2/4/3 over the
 * 2x2 neighborhood (sum 12), using 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1013

    
1014
/**
 * Thirdpel MC, offset (2/3, 2/3): bilinear with weights 2/3/3/4 over the
 * 2x2 neighborhood (sum 12), using 2731/32768 ~= 1/12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1024
#if 0
/* Disabled generator for fixed-width thirdpel wrappers.
   NOTE(review): the stray "void" before each call is a leftover return
   type and would not compile if this block were ever enabled — kept
   verbatim since the code is dead. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1045

    
1046
/* Generates the 2/4/8-wide H.264 chroma MC functions: bilinear with
   1/8-pel weights A..D (sum 64); OP performs the final rounding/store
   (or averaging) step. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1108

    
1109
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1110
#define op_put(a, b) a = (((b) + 32)>>6)
1111

    
1112
H264_CHROMA_MC(put_       , op_put)
1113
H264_CHROMA_MC(avg_       , op_avg)
1114
#undef op_avg
1115
#undef op_put
1116

    
1117
/**
 * Copy a 4-byte-wide block of h rows (dst/src strides may differ).
 * Plain byte copies replace the ST32(LD32()) pointer casts: same result,
 * but no strict-aliasing/alignment UB; the optimizer merges them anyway.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for(i=0; i<h; i++)
    {
        for(k=0; k<4; k++)
            dst[k]= src[k];
        dst+=dstStride;
        src+=srcStride;
    }
}
1127

    
1128
/**
 * Copy an 8-byte-wide block of h rows (dst/src strides may differ).
 * Plain byte copies replace the ST32(LD32()) pointer casts: same result,
 * but no strict-aliasing/alignment UB; the optimizer merges them anyway.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for(i=0; i<h; i++)
    {
        for(k=0; k<8; k++)
            dst[k]= src[k];
        dst+=dstStride;
        src+=srcStride;
    }
}
1139

    
1140
/**
 * Copy a 16-byte-wide block of h rows (dst/src strides may differ).
 * Plain byte copies replace the ST32(LD32()) pointer casts: same result,
 * but no strict-aliasing/alignment UB; the optimizer merges them anyway.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for(i=0; i<h; i++)
    {
        for(k=0; k<16; k++)
            dst[k]= src[k];
        dst+=dstStride;
        src+=srcStride;
    }
}
1153

    
1154
/**
 * Copy a 17-byte-wide block of h rows (16 + 1 edge byte, as needed by
 * the qpel filters below). Plain byte copies replace the ST32(LD32())
 * pointer casts: same result, no strict-aliasing/alignment UB.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for(i=0; i<h; i++)
    {
        for(k=0; k<17; k++)
            dst[k]= src[k];
        dst+=dstStride;
        src+=srcStride;
    }
}
1168

    
1169
/**
 * Copy a 9-byte-wide block of h rows (8 + 1 edge byte, as needed by
 * the qpel filters below). Plain byte copies replace the ST32(LD32())
 * pointer casts: same result, no strict-aliasing/alignment UB.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for(i=0; i<h; i++)
    {
        for(k=0; k<9; k++)
            dst[k]= src[k];
        dst+=dstStride;
        src+=srcStride;
    }
}
1181

    
1182

    
1183
#define QPEL_MC(r, OPNAME, RND, OP) \
1184
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1185
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1186
    int i;\
1187
    for(i=0; i<h; i++)\
1188
    {\
1189
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1190
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1191
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1192
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1193
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1194
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1195
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1196
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1197
        dst+=dstStride;\
1198
        src+=srcStride;\
1199
    }\
1200
}\
1201
\
1202
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1203
    const int w=8;\
1204
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1205
    int i;\
1206
    for(i=0; i<w; i++)\
1207
    {\
1208
        const int src0= src[0*srcStride];\
1209
        const int src1= src[1*srcStride];\
1210
        const int src2= src[2*srcStride];\
1211
        const int src3= src[3*srcStride];\
1212
        const int src4= src[4*srcStride];\
1213
        const int src5= src[5*srcStride];\
1214
        const int src6= src[6*srcStride];\
1215
        const int src7= src[7*srcStride];\
1216
        const int src8= src[8*srcStride];\
1217
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1218
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1219
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1220
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1221
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1222
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1223
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1224
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1225
        dst++;\
1226
        src++;\
1227
    }\
1228
}\
1229
\
1230
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1231
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1232
    int i;\
1233
    \
1234
    for(i=0; i<h; i++)\
1235
    {\
1236
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1237
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1238
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1239
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1240
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1241
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1242
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1243
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1244
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1245
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1246
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1247
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1248
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1249
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1250
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1251
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1252
        dst+=dstStride;\
1253
        src+=srcStride;\
1254
    }\
1255
}\
1256
\
1257
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1258
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1259
    int i;\
1260
    const int w=16;\
1261
    for(i=0; i<w; i++)\
1262
    {\
1263
        const int src0= src[0*srcStride];\
1264
        const int src1= src[1*srcStride];\
1265
        const int src2= src[2*srcStride];\
1266
        const int src3= src[3*srcStride];\
1267
        const int src4= src[4*srcStride];\
1268
        const int src5= src[5*srcStride];\
1269
        const int src6= src[6*srcStride];\
1270
        const int src7= src[7*srcStride];\
1271
        const int src8= src[8*srcStride];\
1272
        const int src9= src[9*srcStride];\
1273
        const int src10= src[10*srcStride];\
1274
        const int src11= src[11*srcStride];\
1275
        const int src12= src[12*srcStride];\
1276
        const int src13= src[13*srcStride];\
1277
        const int src14= src[14*srcStride];\
1278
        const int src15= src[15*srcStride];\
1279
        const int src16= src[16*srcStride];\
1280
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1281
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1282
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1283
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1284
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1285
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1286
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1287
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1288
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1289
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1290
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1291
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1292
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1293
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1294
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1295
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1296
        dst++;\
1297
        src++;\
1298
    }\
1299
}\
1300
\
1301
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1302
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1303
}\
1304
\
1305
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1306
    uint8_t half[64];\
1307
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1308
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1309
}\
1310
\
1311
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1312
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1313
}\
1314
\
1315
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1316
    uint8_t half[64];\
1317
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1318
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1319
}\
1320
\
1321
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1322
    uint8_t full[16*9];\
1323
    uint8_t half[64];\
1324
    copy_block9(full, src, 16, stride, 9);\
1325
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1326
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1327
}\
1328
\
1329
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1330
    uint8_t full[16*9];\
1331
    copy_block9(full, src, 16, stride, 9);\
1332
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1333
}\
1334
\
1335
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1336
    uint8_t full[16*9];\
1337
    uint8_t half[64];\
1338
    copy_block9(full, src, 16, stride, 9);\
1339
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1340
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1341
}\
1342
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1343
    uint8_t full[16*9];\
1344
    uint8_t halfH[72];\
1345
    uint8_t halfV[64];\
1346
    uint8_t halfHV[64];\
1347
    copy_block9(full, src, 16, stride, 9);\
1348
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1349
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1350
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1351
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1352
}\
1353
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1354
    uint8_t full[16*9];\
1355
    uint8_t halfH[72];\
1356
    uint8_t halfHV[64];\
1357
    copy_block9(full, src, 16, stride, 9);\
1358
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1359
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1360
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1361
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1362
}\
1363
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1364
    uint8_t full[16*9];\
1365
    uint8_t halfH[72];\
1366
    uint8_t halfV[64];\
1367
    uint8_t halfHV[64];\
1368
    copy_block9(full, src, 16, stride, 9);\
1369
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1370
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1371
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1372
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1373
}\
1374
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1375
    uint8_t full[16*9];\
1376
    uint8_t halfH[72];\
1377
    uint8_t halfHV[64];\
1378
    copy_block9(full, src, 16, stride, 9);\
1379
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1380
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1381
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1382
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1383
}\
1384
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1385
    uint8_t full[16*9];\
1386
    uint8_t halfH[72];\
1387
    uint8_t halfV[64];\
1388
    uint8_t halfHV[64];\
1389
    copy_block9(full, src, 16, stride, 9);\
1390
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1391
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1392
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1393
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1394
}\
1395
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1396
    uint8_t full[16*9];\
1397
    uint8_t halfH[72];\
1398
    uint8_t halfHV[64];\
1399
    copy_block9(full, src, 16, stride, 9);\
1400
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1401
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1402
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1403
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1404
}\
1405
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1406
    uint8_t full[16*9];\
1407
    uint8_t halfH[72];\
1408
    uint8_t halfV[64];\
1409
    uint8_t halfHV[64];\
1410
    copy_block9(full, src, 16, stride, 9);\
1411
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1412
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1413
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1414
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1415
}\
1416
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1417
    uint8_t full[16*9];\
1418
    uint8_t halfH[72];\
1419
    uint8_t halfHV[64];\
1420
    copy_block9(full, src, 16, stride, 9);\
1421
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1422
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1423
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1424
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1425
}\
1426
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1427
    uint8_t halfH[72];\
1428
    uint8_t halfHV[64];\
1429
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1430
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1431
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1432
}\
1433
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1434
    uint8_t halfH[72];\
1435
    uint8_t halfHV[64];\
1436
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1437
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1438
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1439
}\
1440
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1441
    uint8_t full[16*9];\
1442
    uint8_t halfH[72];\
1443
    uint8_t halfV[64];\
1444
    uint8_t halfHV[64];\
1445
    copy_block9(full, src, 16, stride, 9);\
1446
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1447
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1448
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1449
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1450
}\
1451
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1452
    uint8_t full[16*9];\
1453
    uint8_t halfH[72];\
1454
    copy_block9(full, src, 16, stride, 9);\
1455
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1456
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1457
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1458
}\
1459
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1460
    uint8_t full[16*9];\
1461
    uint8_t halfH[72];\
1462
    uint8_t halfV[64];\
1463
    uint8_t halfHV[64];\
1464
    copy_block9(full, src, 16, stride, 9);\
1465
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1466
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1467
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1468
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1469
}\
1470
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1471
    uint8_t full[16*9];\
1472
    uint8_t halfH[72];\
1473
    copy_block9(full, src, 16, stride, 9);\
1474
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1475
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1476
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1477
}\
1478
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1479
    uint8_t halfH[72];\
1480
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1481
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1482
}\
1483
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1484
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1485
}\
1486
\
1487
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1488
    uint8_t half[256];\
1489
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1490
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1491
}\
1492
\
1493
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1494
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1495
}\
1496
\
1497
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1498
    uint8_t half[256];\
1499
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1500
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1501
}\
1502
\
1503
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1504
    uint8_t full[24*17];\
1505
    uint8_t half[256];\
1506
    copy_block17(full, src, 24, stride, 17);\
1507
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1508
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1509
}\
1510
\
1511
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1512
    uint8_t full[24*17];\
1513
    copy_block17(full, src, 24, stride, 17);\
1514
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1515
}\
1516
\
1517
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1518
    uint8_t full[24*17];\
1519
    uint8_t half[256];\
1520
    copy_block17(full, src, 24, stride, 17);\
1521
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1522
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1523
}\
1524
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1525
    uint8_t full[24*17];\
1526
    uint8_t halfH[272];\
1527
    uint8_t halfV[256];\
1528
    uint8_t halfHV[256];\
1529
    copy_block17(full, src, 24, stride, 17);\
1530
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1531
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1532
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1533
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1534
}\
1535
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1536
    uint8_t full[24*17];\
1537
    uint8_t halfH[272];\
1538
    uint8_t halfHV[256];\
1539
    copy_block17(full, src, 24, stride, 17);\
1540
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1541
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1542
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1543
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1544
}\
1545
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1546
    uint8_t full[24*17];\
1547
    uint8_t halfH[272];\
1548
    uint8_t halfV[256];\
1549
    uint8_t halfHV[256];\
1550
    copy_block17(full, src, 24, stride, 17);\
1551
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1552
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1553
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1554
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1555
}\
1556
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1557
    uint8_t full[24*17];\
1558
    uint8_t halfH[272];\
1559
    uint8_t halfHV[256];\
1560
    copy_block17(full, src, 24, stride, 17);\
1561
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1562
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1563
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1564
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1565
}\
1566
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1567
    uint8_t full[24*17];\
1568
    uint8_t halfH[272];\
1569
    uint8_t halfV[256];\
1570
    uint8_t halfHV[256];\
1571
    copy_block17(full, src, 24, stride, 17);\
1572
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1573
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1574
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1575
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1576
}\
1577
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1578
    uint8_t full[24*17];\
1579
    uint8_t halfH[272];\
1580
    uint8_t halfHV[256];\
1581
    copy_block17(full, src, 24, stride, 17);\
1582
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1583
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1584
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1585
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1586
}\
1587
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1588
    uint8_t full[24*17];\
1589
    uint8_t halfH[272];\
1590
    uint8_t halfV[256];\
1591
    uint8_t halfHV[256];\
1592
    copy_block17(full, src, 24, stride, 17);\
1593
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1594
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1595
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1596
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1597
}\
1598
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1599
    uint8_t full[24*17];\
1600
    uint8_t halfH[272];\
1601
    uint8_t halfHV[256];\
1602
    copy_block17(full, src, 24, stride, 17);\
1603
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1604
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1605
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1606
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1607
}\
1608
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1609
    uint8_t halfH[272];\
1610
    uint8_t halfHV[256];\
1611
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1612
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1613
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1614
}\
1615
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1616
    uint8_t halfH[272];\
1617
    uint8_t halfHV[256];\
1618
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1619
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1620
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1621
}\
1622
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1623
    uint8_t full[24*17];\
1624
    uint8_t halfH[272];\
1625
    uint8_t halfV[256];\
1626
    uint8_t halfHV[256];\
1627
    copy_block17(full, src, 24, stride, 17);\
1628
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1629
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1630
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1631
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1632
}\
1633
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1634
    uint8_t full[24*17];\
1635
    uint8_t halfH[272];\
1636
    copy_block17(full, src, 24, stride, 17);\
1637
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1638
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1639
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1640
}\
1641
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1642
    uint8_t full[24*17];\
1643
    uint8_t halfH[272];\
1644
    uint8_t halfV[256];\
1645
    uint8_t halfHV[256];\
1646
    copy_block17(full, src, 24, stride, 17);\
1647
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1648
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1649
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1650
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1651
}\
1652
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1653
    uint8_t full[24*17];\
1654
    uint8_t halfH[272];\
1655
    copy_block17(full, src, 24, stride, 17);\
1656
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1657
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1658
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1659
}\
1660
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1661
    uint8_t halfH[272];\
1662
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1663
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1664
}
1665

    
1666
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1667
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1668
#define op_put(a, b) a = cm[((b) + 16)>>5]
1669
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1670

    
1671
QPEL_MC(0, put_       , _       , op_put)
1672
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1673
QPEL_MC(0, avg_       , _       , op_avg)
1674
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1675
#undef op_avg
1676
#undef op_avg_no_rnd
1677
#undef op_put
1678
#undef op_put_no_rnd
1679

    
1680
#if 1
/* H.264 half-sample interpolation: six-tap (1,-5,20,20,-5,1) lowpass
 * filters for 4x4, 8x8 and 16x16 blocks, horizontal (_h_), vertical (_v_)
 * and combined (_hv_) variants.  OP stores/averages a single-filtered
 * value (normalized by >>5 inside the op), OP2 a double-filtered value
 * from the hv path (normalized by >>10).  The hv variants first filter
 * horizontally into the int16_t tmp[] scratch area (h+5 rows, to provide
 * the vertical filter's 2-above/3-below margin), then filter tmp
 * vertically.  The 16x16 variants are composed from four 8x8 calls. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H.264 quarter-pel motion-compensation dispatch: one function per
 * quarter-sample position _mcXY (X = horizontal, Y = vertical quarter
 * offset), built from the H264_LOWPASS half-sample filters above.
 * Quarter positions are obtained by averaging two half/full-sample
 * planes via pixels##SIZE##_l2.  Vertical filtering works on an edge-
 * extended copy (copy_block##SIZE into full[], with full_mid skipping
 * the two extra rows copied above the block). */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2022
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2023
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2024
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2025
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2026

    
2027
H264_LOWPASS(put_       , op_put, op2_put)
2028
H264_LOWPASS(avg_       , op_avg, op2_avg)
2029
H264_MC(put_, 4)
2030
H264_MC(put_, 8)
2031
H264_MC(put_, 16)
2032
H264_MC(avg_, 4)
2033
H264_MC(avg_, 8)
2034
H264_MC(avg_, 16)
2035

    
2036
#undef op_avg
2037
#undef op_put
2038
#undef op2_avg
2039
#undef op2_put
2040
#endif
2041

    
2042
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2043
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2044
    int i;
2045

    
2046
    for(i=0; i<h; i++){
2047
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2048
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2049
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2050
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2051
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2052
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2053
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2054
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2055
        dst+=dstStride;
2056
        src+=srcStride;        
2057
    }
2058
}
2059

    
2060
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2061
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2062
    int i;
2063

    
2064
    for(i=0; i<w; i++){
2065
        const int src_1= src[ -srcStride];
2066
        const int src0 = src[0          ];
2067
        const int src1 = src[  srcStride];
2068
        const int src2 = src[2*srcStride];
2069
        const int src3 = src[3*srcStride];
2070
        const int src4 = src[4*srcStride];
2071
        const int src5 = src[5*srcStride];
2072
        const int src6 = src[6*srcStride];
2073
        const int src7 = src[7*srcStride];
2074
        const int src8 = src[8*srcStride];
2075
        const int src9 = src[9*srcStride];
2076
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2077
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2078
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2079
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2080
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2081
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2082
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2083
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2084
        src++;
2085
        dst++;
2086
    }
2087
}
2088

    
2089
/** WMV2 mspel MC, position (0,0): plain integer-pel 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/** WMV2 mspel MC, position (1,0): blend of the integer-pel source and the
 *  horizontal half-pel plane via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/** WMV2 mspel MC, position (2,0): horizontal half-pel filter straight
 *  into the destination. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/** WMV2 mspel MC, position (3,0): blend of the right-neighbour integer
 *  pixels (src+1) and the horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/** WMV2 mspel MC, position (0,2): vertical half-pel filter straight into
 *  the destination. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 mspel MC, diagonal position (left column, vertical half-pel):
 * averages the vertically lowpassed source with the vertically lowpassed
 * horizontal-lowpass result. halfH holds 11 lowpassed rows starting one
 * row above the block; halfH+8 skips that extra top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2122
/* WMV2 mspel MC, diagonal position (right column, vertical half-pel):
 * same as mc12 but the vertical lowpass is taken one pixel to the right
 * (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2131
/* WMV2 mspel MC, center half-pel both directions: horizontal lowpass
 * followed by vertical lowpass (halfH+8 skips the extra top row). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2136

    
2137

    
2138
/* Sum of absolute differences over a 16x16 block. */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2165

    
2166
/* SAD of a 16x16 block against the horizontal half-pel interpolation of
 * pix2 (average of each pixel with its right neighbour; reads one extra
 * column, pix2[16]). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2193

    
2194
/* SAD of a 16x16 block against the vertical half-pel interpolation of
 * pix2 (average of each pixel with the one below; reads one extra row). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2223

    
2224
/* SAD of a 16x16 block against the diagonal half-pel interpolation of
 * pix2 (4-pixel average; reads one extra row and column). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] -
                       avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2253

    
2254
/* Sum of absolute differences over an 8x8 block. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2273

    
2274
/* SAD of an 8x8 block against the horizontal half-pel interpolation of
 * pix2 (reads one extra column, pix2[8]). */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2293

    
2294
/* SAD of an 8x8 block against the vertical half-pel interpolation of
 * pix2 (reads one extra row). */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2315

    
2316
/* SAD of an 8x8 block against the diagonal half-pel interpolation of
 * pix2 (4-pixel average; reads one extra row and column). */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] -
                       avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2337

    
2338
/* 16x16 SAD adapter; the context pointer 's' is unused. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2341

    
2342
/* 8x8 SAD adapter; the context pointer 's' is unused. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2345

    
2346
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];
    
    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms

    /* First pass: stash the coefficients touched by the scan into temp and
     * zero them in place, so the second pass cannot overwrite a value that
     * has not been moved yet. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }
    
    /* Second pass: write each stashed coefficient to its permuted slot. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
2374

    
2375
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* Zero all six 64-coefficient blocks of a macroblock in one call. */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2382

    
2383
/* dst[i] += src[i] for i in [0, w); byte arithmetic wraps mod 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;
    for(pos=0; pos<w; pos++)
        dst[pos] += src[pos];
}
2398

    
2399
/* dst[i] = src1[i] - src2[i] for i in [0, w); byte arithmetic wraps mod 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;
    for(pos=0; pos<w; pos++)
        dst[pos] = src1[pos]-src2[pos];
}
2414

    
2415
/* Butterfly writing sum/difference of i1,i2 into separate outputs o1,o2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y <- x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y| — final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2429

    
2430
/* Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * The context pointer 's' is unused. */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* Horizontal pass: 1-D 8-point Hadamard transform of each row of
     * pixel differences. */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass per column; the last butterfly stage is folded into
     * BUTTERFLYA, which accumulates |coefficient| directly. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2479

    
2480
/* Like hadamard8_diff_c but transforms (src - mean) instead of the
 * difference of two blocks. */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* Horizontal pass: row-wise 8-point Hadamard of (pixel - mean). */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* Vertical pass per column; BUTTERFLYA folds the final stage into the
     * absolute-value accumulation. */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    return sum;
}
2523

    
2524
/* DCT-domain SAD: sum of absolute DCT coefficients of the 8x8 pixel
 * difference between src1 and src2. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing buffer guarantees 8-byte alignment for the DCT. */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2538

    
2539
void simple_idct(DCTELEM *block); //FIXME
2540

    
2541
/* Quantization distortion metric: squared error between the raw pixel
 * difference block and its quantize/dequantize/IDCT round trip.
 * NOTE(review): presumably fast_dct_quantize performs the forward DCT
 * internally, so temp and bak are compared in the spatial domain — verify
 * against the quantizer implementation. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* Two 64-coefficient blocks in one aligned backing buffer. */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2563

    
2564
/* Rate-distortion score for one 8x8 block: SSE after a quantize/
 * dequantize/IDCT round trip, plus a qscale-weighted estimate of the bits
 * needed to code the quantized coefficients. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* NOTE(review): VLA of 'stride' uint64_t = 8*stride bytes, enough for
     * the 8 rows of 'stride' bytes written below. */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    /* Keep a copy of the 8x8 prediction block (8 bytes per row, copied as
     * two 32-bit words) to reconstruct into. */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* Select the VLC length tables; intra blocks code the DC separately. */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* Run-level scan: accumulate VLC bit lengths; levels outside [-64,63]
     * (after the +64 bias, outside [0,127]) take the escape-code length. */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64);
        
        /* The final coefficient uses the "last" VLC table. */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2637

    
2638
/* Rate metric: estimated number of bits needed to code the quantized 8x8
 * difference block (same run-level scan as rd8x8_c, without distortion). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* Select the VLC length tables; intra blocks code the DC separately. */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* Run-level scan: sum VLC bit lengths, escape length for levels
     * outside [0,127] after the +64 bias. */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64);
        
        /* The final coefficient uses the "last" VLC table. */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2695

    
2696

    
2697
/* Generate 16x16 comparator variants from the 8x8 ones via the
 * WARPER88_1616 macro (defined elsewhere in this file; presumably applies
 * the 8x8 function to the four quadrants — verify against the macro). */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2702

    
2703
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* jrevdct-based IDCT followed by a clamped store of the result. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
2710
/* jrevdct-based IDCT followed by a clamped add onto the prediction. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2715

    
2716
/* init static data */
2717
void dsputil_static_init(void)
2718
{
2719
    int i;
2720

    
2721
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2722
    for(i=0;i<MAX_NEG_CROP;i++) {
2723
        cropTbl[i] = 0;
2724
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
2725
    }
2726
    
2727
    for(i=0;i<512;i++) {
2728
        squareTbl[i] = (i - 256) * (i - 256);
2729
    }
2730
    
2731
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2732
}
2733

    
2734

    
2735
/* Fill a DSPContext with the C reference implementations, then let the
 * platform-specific init functions override entries, and finally build
 * the IDCT coefficient permutation table. */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT)
        c->fdct = fdct_ifast;
    else
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
#endif //CONFIG_ENCODERS

    /* Choose the IDCT pair and the coefficient ordering it expects. */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* Basic pixel access / statistics helpers. */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs16x16     = pix_abs16x16_c;
    c->pix_abs16x16_x2  = pix_abs16x16_x2_c;
    c->pix_abs16x16_y2  = pix_abs16x16_y2_c;
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    c->pix_abs8x8     = pix_abs8x8_c;
    c->pix_abs8x8_x2  = pix_abs8x8_x2_c;
    c->pix_abs8x8_y2  = pix_abs8x8_y2_c;
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

/* Wire the four half-pel variants (full, x2, y2, xy2) of one pixel op. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
#undef dspfunc

    /* Third-pel (SVQ3) interpolation table; note indices 3, 7 and 11-15
     * are not filled here. */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

/* Wire all 16 quarter-pel positions (mc00..mc33) of one MC family. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* WMV2 mspel motion compensation (8 positions only). */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
        
    /* Comparators for motion estimation / mode decision:
     * index [0] is the 16x16 variant, [1] the 8x8 variant. */
    c->hadamard8_diff[0]= hadamard8_diff16_c;
    c->hadamard8_diff[1]= hadamard8_diff_c;
    c->hadamard8_abs = hadamard8_abs_c;
    
    c->dct_sad[0]= dct_sad16x16_c;
    c->dct_sad[1]= dct_sad8x8_c;
    
    c->sad[0]= sad16x16_c;
    c->sad[1]= sad8x8_c;
    
    c->quant_psnr[0]= quant_psnr16x16_c;
    c->quant_psnr[1]= quant_psnr8x8_c;

    c->rd[0]= rd16x16_c;
    c->rd[1]= rd8x8_c;

    c->bit[0]= bit16x16_c;
    c->bit[1]= bit8x8_c;
        
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;

    /* Let platform-specific code replace entries with optimized versions. */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* Build the coefficient permutation table matching the chosen IDCT. */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        fprintf(stderr, "Internal error, IDCT permutation not set\n");
    }
}
2926