Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 58c2182d

History | View | Annotate | Download (121 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* Clipping lookup table: cropTbl[MAX_NEG_CROP + x] clamps x to 0..255 without
   a branch; padded by MAX_NEG_CROP entries on each side so moderately
   out-of-range IDCT output can be clamped by a plain array lookup.
   NOTE(review): filled at runtime, presumably by the dsputil init code —
   the initialization is not visible in this chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table: squareTbl[256 + x] == x*x for -256 <= x < 256; the
   SSE/norm routines below index it through a pointer offset by +256 so a
   signed pixel difference can be used directly as the index.
   NOTE(review): also filled at runtime by init code not visible here. */
uint32_t squareTbl[512];
36

    
37
/* Standard (progressive) 8x8 zigzag coefficient scan: entry n is the raster
   index (row*8 + column) of the n-th coefficient in scan order, starting at
   DC in the top-left corner and sweeping along anti-diagonals. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47

    
48
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
/* (Each pair of adjacent entries alternates between the two interlaced
   fields, i.e. between even and odd source rows, instead of scanning one
   field completely before the other as the spec's 2-4-8 scan does.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60

    
61
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): filled at runtime (the "+1" keeps 0 free as a sentinel for
   the MMX quantizer); the code that populates it is not in this chunk. */
uint16_t __align8 inv_zigzag_direct16[64];
63

    
64
/* Alternate horizontal coefficient scan order (entry n = raster index of the
   n-th scanned coefficient); favors horizontally-oriented frequency content. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
74

    
75
/* Alternate vertical coefficient scan order (the transpose-like counterpart
   of the horizontal scan above); favors vertically-oriented content, e.g.
   for interlaced material. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for division-by-multiplication: inverse[b] is
   ceil(2^32 / b) (entry 0 is unused, entry 1 saturates to 0xFFFFFFFF).
   Lets hot paths replace an integer division by a 32x32->64 multiply
   and a shift, within the input range stated above. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121

    
122
/* Input permutation for the simple_idct_mmx */
/* Maps the natural coefficient position to the position expected by the MMX
   IDCT; values are raster indices written in hex (high nibble = row, low
   nibble = column, since 0x10 == 16 == one 8-coefficient row times two). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133

    
134
/**
 * Sum all 256 pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between the starts of successive rows
 * @return the sum of the 16x16 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int total = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;           /* advance to the next row */
    }
    return total;
}
155

    
156
/* Sum of squares of all pixels of a 16x16 block (the L2 "norm" before the
   square root).  Uses the global squareTbl, offset by +256 so signed indices
   would also work, and reads pixels one machine word at a time.
   NOTE(review): the word-sized loads type-pun uint8_t* to uint32_t*/uint64_t*,
   which assumes suitably aligned input and tolerant aliasing behavior from
   the compiler — preserved as-is, do not "clean up" without benchmarking. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {   /* two 8-pixel groups per row */
#if 0
            /* reference byte-at-a-time version, kept for documentation */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: one 8-byte load, then extract each byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-byte loads per 8-pixel group */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;  /* move from end of row to start of the next */
    }
    return s;
}
203

    
204
/**
 * Byte-swap an array of 32-bit words from src into dst.
 * Each word is handled independently, so in-place use (dst == src) behaves
 * the same as the original.
 *
 * @param dst destination array (may alias src)
 * @param src source array
 * @param w   number of 32-bit words to swap
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    /* bulk: eight words per iteration */
    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* tail: remaining 0..7 words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221

    
222
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223
{
224
    int s, i;
225
    uint32_t *sq = squareTbl + 256;
226

    
227
    s = 0;
228
    for (i = 0; i < h; i++) {
229
        s += sq[pix1[0] - pix2[0]];
230
        s += sq[pix1[1] - pix2[1]];
231
        s += sq[pix1[2] - pix2[2]];
232
        s += sq[pix1[3] - pix2[3]];
233
        s += sq[pix1[4] - pix2[4]];
234
        s += sq[pix1[5] - pix2[5]];
235
        s += sq[pix1[6] - pix2[6]];
236
        s += sq[pix1[7] - pix2[7]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242

    
243
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244
{
245
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247

    
248
    s = 0;
249
    for (i = 0; i < h; i++) {
250
        s += sq[pix1[ 0] - pix2[ 0]];
251
        s += sq[pix1[ 1] - pix2[ 1]];
252
        s += sq[pix1[ 2] - pix2[ 2]];
253
        s += sq[pix1[ 3] - pix2[ 3]];
254
        s += sq[pix1[ 4] - pix2[ 4]];
255
        s += sq[pix1[ 5] - pix2[ 5]];
256
        s += sq[pix1[ 6] - pix2[ 6]];
257
        s += sq[pix1[ 7] - pix2[ 7]];
258
        s += sq[pix1[ 8] - pix2[ 8]];
259
        s += sq[pix1[ 9] - pix2[ 9]];
260
        s += sq[pix1[10] - pix2[10]];
261
        s += sq[pix1[11] - pix2[11]];
262
        s += sq[pix1[12] - pix2[12]];
263
        s += sq[pix1[13] - pix2[13]];
264
        s += sq[pix1[14] - pix2[14]];
265
        s += sq[pix1[15] - pix2[15]];
266

    
267
        pix1 += line_size;
268
        pix2 += line_size;
269
    }
270
    return s;
271
}
272

    
273
/**
 * Copy an 8x8 pixel block into a DCT coefficient buffer (widening each
 * byte to a DCTELEM).
 *
 * @param block     destination, 64 contiguous DCTELEMs in raster order
 * @param pixels    source pixels, top-left of the 8x8 block
 * @param line_size byte stride of the source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;             /* destination is densely packed */
    }
}
291

    
292
/**
 * Store the element-wise difference of two 8x8 pixel blocks into a DCT
 * coefficient buffer (block[i] = s1[i] - s2[i]).
 *
 * @param block  destination, 64 contiguous DCTELEMs in raster order
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride of both source blocks
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;             /* destination is densely packed */
    }
}
311

    
312

    
313
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314
                                 int line_size)
315
{
316
    int i;
317
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
    
319
    /* read the pixels */
320
    for(i=0;i<8;i++) {
321
        pixels[0] = cm[block[0]];
322
        pixels[1] = cm[block[1]];
323
        pixels[2] = cm[block[2]];
324
        pixels[3] = cm[block[3]];
325
        pixels[4] = cm[block[4]];
326
        pixels[5] = cm[block[5]];
327
        pixels[6] = cm[block[6]];
328
        pixels[7] = cm[block[7]];
329

    
330
        pixels += line_size;
331
        block += 8;
332
    }
333
}
334

    
335
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336
                          int line_size)
337
{
338
    int i;
339
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
340
    
341
    /* read the pixels */
342
    for(i=0;i<8;i++) {
343
        pixels[0] = cm[pixels[0] + block[0]];
344
        pixels[1] = cm[pixels[1] + block[1]];
345
        pixels[2] = cm[pixels[2] + block[2]];
346
        pixels[3] = cm[pixels[3] + block[3]];
347
        pixels[4] = cm[pixels[4] + block[4]];
348
        pixels[5] = cm[pixels[5] + block[5]];
349
        pixels[6] = cm[pixels[6] + block[6]];
350
        pixels[7] = cm[pixels[7] + block[7]];
351
        pixels += line_size;
352
        block += 8;
353
    }
354
}
355
#if 0
356

357
/*
 * 64-bit-register variant of the PIXOP2 pixel-operation generator.
 * This entire branch is DEAD CODE: it sits inside the "#if 0" just above;
 * the live 32-bit variant follows in the "#else" branch below.  The macro
 * expands, for a given OPNAME (put/avg) and OP (store/average), the whole
 * family of 8-pixel-wide copy/halfpel-interpolation primitives (_x2: average
 * with the pixel to the right, _y2: with the pixel below, _xy2: 4-tap
 * bilinear with rounding), operating one 64-bit word per row via LD64.
 * NOTE(review): the first function is named OPNAME ## _pixels (no "_c"
 * suffix) while CALL_2X_PIXELS at the bottom refers to OPNAME ## _pixels_c;
 * this looks inconsistent, but since the branch is disabled it never bites.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* rounding-down 64-bit packed average: a = avg(a, b) without the +1 bias */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
497
#else // 64 bit variant
498

    
499
#define PIXOP2(OPNAME, OP) \
500
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
501
    int i;\
502
    for(i=0; i<h; i++){\
503
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
504
        pixels+=line_size;\
505
        block +=line_size;\
506
    }\
507
}\
508
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
509
    int i;\
510
    for(i=0; i<h; i++){\
511
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
512
        pixels+=line_size;\
513
        block +=line_size;\
514
    }\
515
}\
516
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
517
    int i;\
518
    for(i=0; i<h; i++){\
519
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
520
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
521
        pixels+=line_size;\
522
        block +=line_size;\
523
    }\
524
}\
525
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
526
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
527
}\
528
\
529
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
                                                int src_stride1, int src_stride2, int h){\
531
    int i;\
532
    for(i=0; i<h; i++){\
533
        uint32_t a,b;\
534
        a= LD32(&src1[i*src_stride1  ]);\
535
        b= LD32(&src2[i*src_stride2  ]);\
536
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
537
        a= LD32(&src1[i*src_stride1+4]);\
538
        b= LD32(&src2[i*src_stride2+4]);\
539
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
540
    }\
541
}\
542
\
543
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
                                                int src_stride1, int src_stride2, int h){\
545
    int i;\
546
    for(i=0; i<h; i++){\
547
        uint32_t a,b;\
548
        a= LD32(&src1[i*src_stride1  ]);\
549
        b= LD32(&src2[i*src_stride2  ]);\
550
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
551
        a= LD32(&src1[i*src_stride1+4]);\
552
        b= LD32(&src2[i*src_stride2+4]);\
553
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
554
    }\
555
}\
556
\
557
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
558
                                                int src_stride1, int src_stride2, int h){\
559
    int i;\
560
    for(i=0; i<h; i++){\
561
        uint32_t a,b;\
562
        a= LD32(&src1[i*src_stride1  ]);\
563
        b= LD32(&src2[i*src_stride2  ]);\
564
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
565
    }\
566
}\
567
\
568
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
569
                                                int src_stride1, int src_stride2, int h){\
570
    int i;\
571
    for(i=0; i<h; i++){\
572
        uint32_t a,b;\
573
        a= LD16(&src1[i*src_stride1  ]);\
574
        b= LD16(&src2[i*src_stride2  ]);\
575
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
576
    }\
577
}\
578
\
579
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
580
                                                int src_stride1, int src_stride2, int h){\
581
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
582
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
583
}\
584
\
585
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
586
                                                int src_stride1, int src_stride2, int h){\
587
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
588
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
589
}\
590
\
591
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
592
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
593
}\
594
\
595
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
596
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
597
}\
598
\
599
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
600
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
601
}\
602
\
603
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
604
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
605
}\
606
\
607
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
608
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
609
    int i;\
610
    for(i=0; i<h; i++){\
611
        uint32_t a, b, c, d, l0, l1, h0, h1;\
612
        a= LD32(&src1[i*src_stride1]);\
613
        b= LD32(&src2[i*src_stride2]);\
614
        c= LD32(&src3[i*src_stride3]);\
615
        d= LD32(&src4[i*src_stride4]);\
616
        l0=  (a&0x03030303UL)\
617
           + (b&0x03030303UL)\
618
           + 0x02020202UL;\
619
        h0= ((a&0xFCFCFCFCUL)>>2)\
620
          + ((b&0xFCFCFCFCUL)>>2);\
621
        l1=  (c&0x03030303UL)\
622
           + (d&0x03030303UL);\
623
        h1= ((c&0xFCFCFCFCUL)>>2)\
624
          + ((d&0xFCFCFCFCUL)>>2);\
625
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
626
        a= LD32(&src1[i*src_stride1+4]);\
627
        b= LD32(&src2[i*src_stride2+4]);\
628
        c= LD32(&src3[i*src_stride3+4]);\
629
        d= LD32(&src4[i*src_stride4+4]);\
630
        l0=  (a&0x03030303UL)\
631
           + (b&0x03030303UL)\
632
           + 0x02020202UL;\
633
        h0= ((a&0xFCFCFCFCUL)>>2)\
634
          + ((b&0xFCFCFCFCUL)>>2);\
635
        l1=  (c&0x03030303UL)\
636
           + (d&0x03030303UL);\
637
        h1= ((c&0xFCFCFCFCUL)>>2)\
638
          + ((d&0xFCFCFCFCUL)>>2);\
639
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
640
    }\
641
}\
642
\
643
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
644
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
645
}\
646
\
647
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
648
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
649
}\
650
\
651
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
652
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
653
}\
654
\
655
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
656
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
657
}\
658
\
659
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
660
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
661
    int i;\
662
    for(i=0; i<h; i++){\
663
        uint32_t a, b, c, d, l0, l1, h0, h1;\
664
        a= LD32(&src1[i*src_stride1]);\
665
        b= LD32(&src2[i*src_stride2]);\
666
        c= LD32(&src3[i*src_stride3]);\
667
        d= LD32(&src4[i*src_stride4]);\
668
        l0=  (a&0x03030303UL)\
669
           + (b&0x03030303UL)\
670
           + 0x01010101UL;\
671
        h0= ((a&0xFCFCFCFCUL)>>2)\
672
          + ((b&0xFCFCFCFCUL)>>2);\
673
        l1=  (c&0x03030303UL)\
674
           + (d&0x03030303UL);\
675
        h1= ((c&0xFCFCFCFCUL)>>2)\
676
          + ((d&0xFCFCFCFCUL)>>2);\
677
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
678
        a= LD32(&src1[i*src_stride1+4]);\
679
        b= LD32(&src2[i*src_stride2+4]);\
680
        c= LD32(&src3[i*src_stride3+4]);\
681
        d= LD32(&src4[i*src_stride4+4]);\
682
        l0=  (a&0x03030303UL)\
683
           + (b&0x03030303UL)\
684
           + 0x01010101UL;\
685
        h0= ((a&0xFCFCFCFCUL)>>2)\
686
          + ((b&0xFCFCFCFCUL)>>2);\
687
        l1=  (c&0x03030303UL)\
688
           + (d&0x03030303UL);\
689
        h1= ((c&0xFCFCFCFCUL)>>2)\
690
          + ((d&0xFCFCFCFCUL)>>2);\
691
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
692
    }\
693
}\
694
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
695
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
696
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
698
}\
699
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
700
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
701
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
703
}\
704
\
705
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
706
{\
707
        int i, a0, b0, a1, b1;\
708
        a0= pixels[0];\
709
        b0= pixels[1] + 2;\
710
        a0 += b0;\
711
        b0 += pixels[2];\
712
\
713
        pixels+=line_size;\
714
        for(i=0; i<h; i+=2){\
715
            a1= pixels[0];\
716
            b1= pixels[1];\
717
            a1 += b1;\
718
            b1 += pixels[2];\
719
\
720
            block[0]= (a1+a0)>>2; /* FIXME non put */\
721
            block[1]= (b1+b0)>>2;\
722
\
723
            pixels+=line_size;\
724
            block +=line_size;\
725
\
726
            a0= pixels[0];\
727
            b0= pixels[1] + 2;\
728
            a0 += b0;\
729
            b0 += pixels[2];\
730
\
731
            block[0]= (a1+a0)>>2;\
732
            block[1]= (b1+b0)>>2;\
733
            pixels+=line_size;\
734
            block +=line_size;\
735
        }\
736
}\
737
\
738
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
739
{\
740
        int i;\
741
        const uint32_t a= LD32(pixels  );\
742
        const uint32_t b= LD32(pixels+1);\
743
        uint32_t l0=  (a&0x03030303UL)\
744
                    + (b&0x03030303UL)\
745
                    + 0x02020202UL;\
746
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
747
                   + ((b&0xFCFCFCFCUL)>>2);\
748
        uint32_t l1,h1;\
749
\
750
        pixels+=line_size;\
751
        for(i=0; i<h; i+=2){\
752
            uint32_t a= LD32(pixels  );\
753
            uint32_t b= LD32(pixels+1);\
754
            l1=  (a&0x03030303UL)\
755
               + (b&0x03030303UL);\
756
            h1= ((a&0xFCFCFCFCUL)>>2)\
757
              + ((b&0xFCFCFCFCUL)>>2);\
758
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
759
            pixels+=line_size;\
760
            block +=line_size;\
761
            a= LD32(pixels  );\
762
            b= LD32(pixels+1);\
763
            l0=  (a&0x03030303UL)\
764
               + (b&0x03030303UL)\
765
               + 0x02020202UL;\
766
            h0= ((a&0xFCFCFCFCUL)>>2)\
767
              + ((b&0xFCFCFCFCUL)>>2);\
768
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
769
            pixels+=line_size;\
770
            block +=line_size;\
771
        }\
772
}\
773
\
774
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
775
{\
776
    int j;\
777
    for(j=0; j<2; j++){\
778
        int i;\
779
        const uint32_t a= LD32(pixels  );\
780
        const uint32_t b= LD32(pixels+1);\
781
        uint32_t l0=  (a&0x03030303UL)\
782
                    + (b&0x03030303UL)\
783
                    + 0x02020202UL;\
784
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
785
                   + ((b&0xFCFCFCFCUL)>>2);\
786
        uint32_t l1,h1;\
787
\
788
        pixels+=line_size;\
789
        for(i=0; i<h; i+=2){\
790
            uint32_t a= LD32(pixels  );\
791
            uint32_t b= LD32(pixels+1);\
792
            l1=  (a&0x03030303UL)\
793
               + (b&0x03030303UL);\
794
            h1= ((a&0xFCFCFCFCUL)>>2)\
795
              + ((b&0xFCFCFCFCUL)>>2);\
796
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
797
            pixels+=line_size;\
798
            block +=line_size;\
799
            a= LD32(pixels  );\
800
            b= LD32(pixels+1);\
801
            l0=  (a&0x03030303UL)\
802
               + (b&0x03030303UL)\
803
               + 0x02020202UL;\
804
            h0= ((a&0xFCFCFCFCUL)>>2)\
805
              + ((b&0xFCFCFCFCUL)>>2);\
806
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
807
            pixels+=line_size;\
808
            block +=line_size;\
809
        }\
810
        pixels+=4-line_size*(h+1);\
811
        block +=4-line_size*h;\
812
    }\
813
}\
814
\
815
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
816
{\
817
    int j;\
818
    for(j=0; j<2; j++){\
819
        int i;\
820
        const uint32_t a= LD32(pixels  );\
821
        const uint32_t b= LD32(pixels+1);\
822
        uint32_t l0=  (a&0x03030303UL)\
823
                    + (b&0x03030303UL)\
824
                    + 0x01010101UL;\
825
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
826
                   + ((b&0xFCFCFCFCUL)>>2);\
827
        uint32_t l1,h1;\
828
\
829
        pixels+=line_size;\
830
        for(i=0; i<h; i+=2){\
831
            uint32_t a= LD32(pixels  );\
832
            uint32_t b= LD32(pixels+1);\
833
            l1=  (a&0x03030303UL)\
834
               + (b&0x03030303UL);\
835
            h1= ((a&0xFCFCFCFCUL)>>2)\
836
              + ((b&0xFCFCFCFCUL)>>2);\
837
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
838
            pixels+=line_size;\
839
            block +=line_size;\
840
            a= LD32(pixels  );\
841
            b= LD32(pixels+1);\
842
            l0=  (a&0x03030303UL)\
843
               + (b&0x03030303UL)\
844
               + 0x01010101UL;\
845
            h0= ((a&0xFCFCFCFCUL)>>2)\
846
              + ((b&0xFCFCFCFCUL)>>2);\
847
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
848
            pixels+=line_size;\
849
            block +=line_size;\
850
        }\
851
        pixels+=4-line_size*(h+1);\
852
        block +=4-line_size*h;\
853
    }\
854
}\
855
\
856
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
859
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
863
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
864

    
865
#define op_avg(a, b) a = rnd_avg32(a, b)
866
#endif
867
#define op_put(a, b) a = b
868

    
869
PIXOP2(avg, op_avg)
870
PIXOP2(put, op_put)
871
#undef op_avg
872
#undef op_put
873

    
874
#define avg2(a,b) ((a+b+1)>>1)
875
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
876

    
877
/* Thin wrapper: average two 16-wide sources into dst (no-rounding variant),
 * using the same stride for all three buffers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
880

    
881
/* Thin wrapper: average two 8-wide sources into dst (no-rounding variant),
 * using the same stride for all three buffers. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
884

    
885
/**
 * Bilinear interpolation of an 8-pixel-wide block, h rows tall.
 * x16/y16 are sub-pel positions in 1/16-pel units; each output pixel is the
 * 2x2-neighbourhood weighted sum (weights A..D sum to 256, hence the >>8),
 * biased by 'rounder' before the shift.  Reads one extra row/column of src.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
907

    
908
/**
 * Global motion compensation with an affine motion field: for each of the
 * 8 pixels per row, (vx,vy) is a 16.16 fixed-point source position advanced
 * by (dxx,dyx) per pixel and (dxy,dyy) per row.  'shift' gives the sub-pel
 * precision (s = 1<<shift), 'r' is the rounding bias.  Positions outside
 * [0,width)x[0,height) are clamped to the source edge (via clip()), with
 * 1-D or nearest-neighbour interpolation at the borders.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;
    
    /* convert to inclusive maxima for the unsigned range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            /* unsigned compare folds the src_x<0 and src_x>=width tests */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically clamped: horizontal-only interpolation */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally clamped: vertical-only interpolation */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: nearest edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
965

    
966
/* Integer-position copy for the tpel MC family: dispatch on block width
 * to the corresponding fixed-width put_pixels helper. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
974

    
975
/* Horizontal interpolation, weights 2:1 on src[j]/src[j+1]
 * (683/2048 approximates 1/3 — presumably thirdpel MC; confirm with caller). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
985

    
986
/* Horizontal interpolation, weights 1:2 on src[j]/src[j+1] (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
996
    
997
/* Vertical interpolation, weights 2:1 on src[j]/src[j+stride] (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1007
    
1008
/* 2-D interpolation, weights 4:3:3:2 over the 2x2 neighbourhood
 * (2731/32768 ~ 1/12; weights sum to 12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1018

    
1019
/* 2-D interpolation, weights 3:2:4:3 over the 2x2 neighbourhood (sum 12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1029

    
1030
/* Vertical interpolation, weights 1:2 on src[j]/src[j+stride] (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1040

    
1041
/* 2-D interpolation, weights 3:4:2:3 over the 2x2 neighbourhood (sum 12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1051

    
1052
/* 2-D interpolation, weights 2:3:3:4 over the 2x2 neighbourhood (sum 12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1062

    
1063
/* Integer-position averaging for the tpel MC family: dispatch on block width
 * to the corresponding fixed-width avg_pixels helper. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1071

    
1072
/* Averaging variant of mc10: rounding average of the existing dst pixel
 * with the 2:1 horizontal interpolation result. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1082

    
1083
/* Averaging variant of mc20: rounding average of the existing dst pixel
 * with the 1:2 horizontal interpolation result. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1093
    
1094
/* Averaging variant of mc01: rounding average of the existing dst pixel
 * with the 2:1 vertical interpolation result. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1104
    
1105
/* Averaging variant of mc11: rounding average of the existing dst pixel
 * with the 4:3:3:2 2-D interpolation result. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1115

    
1116
/* Averaging variant of mc12: rounding average of the existing dst pixel
 * with the 3:2:4:3 2-D interpolation result. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1126

    
1127
/* Averaging variant of mc02: rounding average of the existing dst pixel
 * with the 1:2 vertical interpolation result. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1137

    
1138
/* Averaging variant of mc21: rounding average of the existing dst pixel
 * with the 3:4:2:3 2-D interpolation result. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1148

    
1149
/* Averaging variant of mc22: rounding average of the existing dst pixel
 * with the 2:3:3:4 2-D interpolation result. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1159
#if 0
/* Dead code: per-width tpel wrappers.  NOTE(review): each body prefixes the
 * call with a stray 'void', so this would not compile if ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1180

    
1181
/* H.264 chroma motion compensation template: bilinear 2x2 weighting with
 * 1/8-pel fractional offsets x,y in [0,8); weights A..D sum to 64.
 * OP applies the final rounding/store (and averaging for the avg variant).
 * Generates 2-, 4- and 8-pixel-wide versions. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1243

    
1244
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1245
#define op_put(a, b) a = (((b) + 32)>>6)
1246

    
1247
H264_CHROMA_MC(put_       , op_put)
1248
H264_CHROMA_MC(avg_       , op_avg)
1249
#undef op_avg
1250
#undef op_put
1251

    
1252
/* Copy a 4-pixel-wide block of h rows via the 32-bit unaligned
 * load/store macros (LD32/ST32). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1262

    
1263
/* Copy an 8-pixel-wide block of h rows via two 32-bit unaligned
 * load/store operations per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1274

    
1275
/* Copy a 16-pixel-wide block of h rows via four 32-bit unaligned
 * load/store operations per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1288

    
1289
/* Copy a 17-pixel-wide block (16 via word stores + 1 trailing byte) —
 * the extra column feeds the qpel horizontal filters. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1303

    
1304
/* Copy a 9-pixel-wide block (8 via word stores + 1 trailing byte) —
 * the extra column feeds the qpel horizontal filters. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1316

    
1317

    
1318
#define QPEL_MC(r, OPNAME, RND, OP) \
1319
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1320
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1321
    int i;\
1322
    for(i=0; i<h; i++)\
1323
    {\
1324
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1325
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1326
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1327
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1328
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1329
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1330
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1331
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1332
        dst+=dstStride;\
1333
        src+=srcStride;\
1334
    }\
1335
}\
1336
\
1337
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1338
    const int w=8;\
1339
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1340
    int i;\
1341
    for(i=0; i<w; i++)\
1342
    {\
1343
        const int src0= src[0*srcStride];\
1344
        const int src1= src[1*srcStride];\
1345
        const int src2= src[2*srcStride];\
1346
        const int src3= src[3*srcStride];\
1347
        const int src4= src[4*srcStride];\
1348
        const int src5= src[5*srcStride];\
1349
        const int src6= src[6*srcStride];\
1350
        const int src7= src[7*srcStride];\
1351
        const int src8= src[8*srcStride];\
1352
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1353
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1354
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1355
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1356
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1357
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1358
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1359
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1360
        dst++;\
1361
        src++;\
1362
    }\
1363
}\
1364
\
1365
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1366
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1367
    int i;\
1368
    \
1369
    for(i=0; i<h; i++)\
1370
    {\
1371
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1372
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1373
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1374
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1375
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1376
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1377
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1378
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1379
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1380
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1381
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1382
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1383
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1384
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1385
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1386
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1387
        dst+=dstStride;\
1388
        src+=srcStride;\
1389
    }\
1390
}\
1391
\
1392
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1393
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1394
    int i;\
1395
    const int w=16;\
1396
    for(i=0; i<w; i++)\
1397
    {\
1398
        const int src0= src[0*srcStride];\
1399
        const int src1= src[1*srcStride];\
1400
        const int src2= src[2*srcStride];\
1401
        const int src3= src[3*srcStride];\
1402
        const int src4= src[4*srcStride];\
1403
        const int src5= src[5*srcStride];\
1404
        const int src6= src[6*srcStride];\
1405
        const int src7= src[7*srcStride];\
1406
        const int src8= src[8*srcStride];\
1407
        const int src9= src[9*srcStride];\
1408
        const int src10= src[10*srcStride];\
1409
        const int src11= src[11*srcStride];\
1410
        const int src12= src[12*srcStride];\
1411
        const int src13= src[13*srcStride];\
1412
        const int src14= src[14*srcStride];\
1413
        const int src15= src[15*srcStride];\
1414
        const int src16= src[16*srcStride];\
1415
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1416
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1417
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1418
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1419
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1420
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1421
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1422
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1423
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1424
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1425
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1426
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1427
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1428
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1429
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1430
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1431
        dst++;\
1432
        src++;\
1433
    }\
1434
}\
1435
\
1436
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1437
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1438
}\
1439
\
1440
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1441
    uint8_t half[64];\
1442
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1443
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1444
}\
1445
\
1446
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1447
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1448
}\
1449
\
1450
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1451
    uint8_t half[64];\
1452
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1453
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1454
}\
1455
\
1456
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458
    uint8_t half[64];\
1459
    copy_block9(full, src, 16, stride, 9);\
1460
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1461
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1462
}\
1463
\
1464
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1465
    uint8_t full[16*9];\
1466
    copy_block9(full, src, 16, stride, 9);\
1467
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1468
}\
1469
\
1470
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1471
    uint8_t full[16*9];\
1472
    uint8_t half[64];\
1473
    copy_block9(full, src, 16, stride, 9);\
1474
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1475
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1476
}\
1477
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478
    uint8_t full[16*9];\
1479
    uint8_t halfH[72];\
1480
    uint8_t halfV[64];\
1481
    uint8_t halfHV[64];\
1482
    copy_block9(full, src, 16, stride, 9);\
1483
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1485
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1487
}\
1488
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1489
    uint8_t full[16*9];\
1490
    uint8_t halfH[72];\
1491
    uint8_t halfHV[64];\
1492
    copy_block9(full, src, 16, stride, 9);\
1493
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1495
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1497
}\
1498
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499
    uint8_t full[16*9];\
1500
    uint8_t halfH[72];\
1501
    uint8_t halfV[64];\
1502
    uint8_t halfHV[64];\
1503
    copy_block9(full, src, 16, stride, 9);\
1504
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1506
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508
}\
1509
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1510
    uint8_t full[16*9];\
1511
    uint8_t halfH[72];\
1512
    uint8_t halfHV[64];\
1513
    copy_block9(full, src, 16, stride, 9);\
1514
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1516
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518
}\
1519
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520
    uint8_t full[16*9];\
1521
    uint8_t halfH[72];\
1522
    uint8_t halfV[64];\
1523
    uint8_t halfHV[64];\
1524
    copy_block9(full, src, 16, stride, 9);\
1525
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1527
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529
}\
1530
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1531
    uint8_t full[16*9];\
1532
    uint8_t halfH[72];\
1533
    uint8_t halfHV[64];\
1534
    copy_block9(full, src, 16, stride, 9);\
1535
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1537
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1539
}\
1540
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541
    uint8_t full[16*9];\
1542
    uint8_t halfH[72];\
1543
    uint8_t halfV[64];\
1544
    uint8_t halfHV[64];\
1545
    copy_block9(full, src, 16, stride, 9);\
1546
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1547
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1548
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550
}\
1551
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1552
    uint8_t full[16*9];\
1553
    uint8_t halfH[72];\
1554
    uint8_t halfHV[64];\
1555
    copy_block9(full, src, 16, stride, 9);\
1556
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1558
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560
}\
1561
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1562
    uint8_t halfH[72];\
1563
    uint8_t halfHV[64];\
1564
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1567
}\
1568
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1569
    uint8_t halfH[72];\
1570
    uint8_t halfHV[64];\
1571
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1572
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1573
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1574
}\
1575
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1576
    uint8_t full[16*9];\
1577
    uint8_t halfH[72];\
1578
    uint8_t halfV[64];\
1579
    uint8_t halfHV[64];\
1580
    copy_block9(full, src, 16, stride, 9);\
1581
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1582
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1583
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1584
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1585
}\
1586
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1587
    uint8_t full[16*9];\
1588
    uint8_t halfH[72];\
1589
    copy_block9(full, src, 16, stride, 9);\
1590
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1591
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1592
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1593
}\
1594
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1595
    uint8_t full[16*9];\
1596
    uint8_t halfH[72];\
1597
    uint8_t halfV[64];\
1598
    uint8_t halfHV[64];\
1599
    copy_block9(full, src, 16, stride, 9);\
1600
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1601
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1602
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1603
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1604
}\
1605
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1606
    uint8_t full[16*9];\
1607
    uint8_t halfH[72];\
1608
    copy_block9(full, src, 16, stride, 9);\
1609
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1610
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1611
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1612
}\
1613
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1614
    uint8_t halfH[72];\
1615
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1616
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1617
}\
1618
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1619
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1620
}\
1621
\
1622
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1623
    uint8_t half[256];\
1624
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1625
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1626
}\
1627
\
1628
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1629
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1630
}\
1631
\
1632
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1633
    uint8_t half[256];\
1634
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1635
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1636
}\
1637
\
1638
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640
    uint8_t half[256];\
1641
    copy_block17(full, src, 24, stride, 17);\
1642
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1643
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1644
}\
1645
\
1646
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1647
    uint8_t full[24*17];\
1648
    copy_block17(full, src, 24, stride, 17);\
1649
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1650
}\
1651
\
1652
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1653
    uint8_t full[24*17];\
1654
    uint8_t half[256];\
1655
    copy_block17(full, src, 24, stride, 17);\
1656
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1657
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1658
}\
1659
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660
    uint8_t full[24*17];\
1661
    uint8_t halfH[272];\
1662
    uint8_t halfV[256];\
1663
    uint8_t halfHV[256];\
1664
    copy_block17(full, src, 24, stride, 17);\
1665
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1667
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1669
}\
1670
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1671
    uint8_t full[24*17];\
1672
    uint8_t halfH[272];\
1673
    uint8_t halfHV[256];\
1674
    copy_block17(full, src, 24, stride, 17);\
1675
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1677
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1679
}\
1680
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681
    uint8_t full[24*17];\
1682
    uint8_t halfH[272];\
1683
    uint8_t halfV[256];\
1684
    uint8_t halfHV[256];\
1685
    copy_block17(full, src, 24, stride, 17);\
1686
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1688
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690
}\
1691
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1692
    uint8_t full[24*17];\
1693
    uint8_t halfH[272];\
1694
    uint8_t halfHV[256];\
1695
    copy_block17(full, src, 24, stride, 17);\
1696
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1698
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700
}\
1701
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702
    uint8_t full[24*17];\
1703
    uint8_t halfH[272];\
1704
    uint8_t halfV[256];\
1705
    uint8_t halfHV[256];\
1706
    copy_block17(full, src, 24, stride, 17);\
1707
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1709
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711
}\
1712
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1713
    uint8_t full[24*17];\
1714
    uint8_t halfH[272];\
1715
    uint8_t halfHV[256];\
1716
    copy_block17(full, src, 24, stride, 17);\
1717
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1719
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1721
}\
1722
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723
    uint8_t full[24*17];\
1724
    uint8_t halfH[272];\
1725
    uint8_t halfV[256];\
1726
    uint8_t halfHV[256];\
1727
    copy_block17(full, src, 24, stride, 17);\
1728
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1729
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1730
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732
}\
1733
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1734
    uint8_t full[24*17];\
1735
    uint8_t halfH[272];\
1736
    uint8_t halfHV[256];\
1737
    copy_block17(full, src, 24, stride, 17);\
1738
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1740
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742
}\
1743
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1744
    uint8_t halfH[272];\
1745
    uint8_t halfHV[256];\
1746
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1749
}\
1750
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1751
    uint8_t halfH[272];\
1752
    uint8_t halfHV[256];\
1753
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1754
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1755
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1756
}\
1757
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758
    uint8_t full[24*17];\
1759
    uint8_t halfH[272];\
1760
    uint8_t halfV[256];\
1761
    uint8_t halfHV[256];\
1762
    copy_block17(full, src, 24, stride, 17);\
1763
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1764
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1765
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1766
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1767
}\
1768
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[24*17];\
1770
    uint8_t halfH[272];\
1771
    copy_block17(full, src, 24, stride, 17);\
1772
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1773
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1774
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1775
}\
1776
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777
    uint8_t full[24*17];\
1778
    uint8_t halfH[272];\
1779
    uint8_t halfV[256];\
1780
    uint8_t halfHV[256];\
1781
    copy_block17(full, src, 24, stride, 17);\
1782
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1783
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1784
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1785
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1786
}\
1787
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1788
    uint8_t full[24*17];\
1789
    uint8_t halfH[272];\
1790
    copy_block17(full, src, 24, stride, 17);\
1791
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1792
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1793
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1794
}\
1795
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1796
    uint8_t halfH[272];\
1797
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1798
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1799
}
1800

    
1801
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1802
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1803
#define op_put(a, b) a = cm[((b) + 16)>>5]
1804
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1805

    
1806
QPEL_MC(0, put_       , _       , op_put)
1807
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1808
QPEL_MC(0, avg_       , _       , op_avg)
1809
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1810
#undef op_avg
1811
#undef op_avg_no_rnd
1812
#undef op_put
1813
#undef op_put_no_rnd
1814

    
1815
#if 1
/* H.264 6-tap (1,-5,20,20,-5,1) half-pel interpolation filter generator.
 * OPNAME prefixes the generated function names; OP stores/averages a value
 * still scaled by 32 (single pass), OP2 one scaled by 1024 (combined h+v
 * pass, which filters the intermediate int16_t plane a second time).
 * cm (cropTbl + MAX_NEG_CROP) clamps results to the 8-bit pixel range. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H.264 quarter-pel motion-compensation generator for one block size.
 * Generates the 16 _mcXY_c entry points (X = horizontal qpel phase,
 * Y = vertical qpel phase) by combining the _h/_v/_hv lowpass half-pel
 * filters with pixel averaging (the *_l2 helpers).  Relies on the
 * copy_block ## SIZE helpers defined elsewhere in this file. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2156
/* Store macros plugged into the H264_LOWPASS/H264_MC templates above.
 * 'cm' is the clip table (cropTbl + MAX_NEG_CROP); "+16 >> 5" normalizes
 * a single 6-tap filter pass with rounding, "+512 >> 10" a two-pass
 * (horizontal+vertical) result.  The *_avg variants round-average with
 * the existing destination pixel. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the put/avg H.264 quarter-pel motion-compensation
 * functions for 4x4, 8x8 and 16x16 block sizes. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif /* NOTE(review): closes a conditional opened before this chunk -- verify which #if it matches */
2176

    
2177
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2178
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2179
    int i;
2180

    
2181
    for(i=0; i<h; i++){
2182
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2183
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2184
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2185
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2186
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2187
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2188
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2189
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2190
        dst+=dstStride;
2191
        src+=srcStride;        
2192
    }
2193
}
2194

    
2195
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2196
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2197
    int i;
2198

    
2199
    for(i=0; i<w; i++){
2200
        const int src_1= src[ -srcStride];
2201
        const int src0 = src[0          ];
2202
        const int src1 = src[  srcStride];
2203
        const int src2 = src[2*srcStride];
2204
        const int src3 = src[3*srcStride];
2205
        const int src4 = src[4*srcStride];
2206
        const int src5 = src[5*srcStride];
2207
        const int src6 = src[6*srcStride];
2208
        const int src7 = src[7*srcStride];
2209
        const int src8 = src[8*srcStride];
2210
        const int src9 = src[9*srcStride];
2211
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2212
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2213
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2214
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2215
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2216
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2217
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2218
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2219
        src++;
2220
        dst++;
2221
    }
2222
}
2223

    
2224
/* WMV2 mspel MC, position (0,0): plain 8x8 pixel copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2227

    
2228
/* WMV2 mspel MC, position (1,0): quarter-pel left -- average of the
 * unfiltered source and the horizontally half-pel filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2233

    
2234
/* WMV2 mspel MC, position (2,0): horizontal half-pel filter written
 * straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2237

    
2238
/* WMV2 mspel MC, position (3,0): quarter-pel right -- average of the
 * source shifted one pixel right and the half-pel filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2243

    
2244
/* WMV2 mspel MC, position (0,2): vertical half-pel filter into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2247

    
2248
/* WMV2 mspel MC, position (1,2): average of the vertically filtered
 * block and the horizontal-then-vertical filtered block.  halfH holds
 * 11 horizontally filtered rows starting one row above the block;
 * halfH+8 skips that extra row so the vertical pass (which reads one
 * row above its src pointer) lines up with the block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2257
/* WMV2 mspel MC, position (3,2): like mc12 but the pure vertical pass
 * runs on the source shifted one pixel right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2266
/* WMV2 mspel MC, position (2,2): horizontal then vertical half-pel
 * filtering, result written directly to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2271

    
2272
/* H.263 in-loop deblocking across a horizontal block edge: for each of
 * 8 columns, adjusts the two rows on either side of the boundary.
 * d1 is the standard H.263 ramp nonlinearity of the edge gradient d,
 * bounded by the qscale-dependent filter strength; p1/p2 (the inner
 * pair) are corrected by d1, p0/p3 (the outer pair) by a smaller d2
 * limited to |d1|/2. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: full correction for small gradients,
           tapering to zero beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255: bit 8 set means either negative
           (sign bits set -> ~(-1)=0) or >255 (-> ~0 = all ones = 255) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2306

    
2307
/* H.263 in-loop deblocking across a vertical block edge: same filter
 * as h263_v_loop_filter_c but applied to the two pixels on each side
 * of the boundary in every one of 8 rows. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp, see h263_v_loop_filter_c */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* branchless clamp of the 9-bit-range intermediates to 0..255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2341

    
2342
/* Sum of absolute differences over a 16-pixel-wide block of height h.
 * 'v' is an unused context pointer (me_cmp_func signature). */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2369

    
2370
/* SAD of pix1 against pix2 interpolated a half pel to the right
 * (rounded average of columns x and x+1). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2397

    
2398
/* SAD of pix1 against pix2 interpolated a half pel downwards
 * (rounded average of a pixel and the one directly below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2427

    
2428
/* SAD of pix1 against pix2 interpolated a half pel both right and down
 * (rounded average of the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2457

    
2458
/* Sum of absolute differences over an 8-pixel-wide block of height h. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2477

    
2478
/* 8-wide SAD against pix2 half-pel interpolated horizontally. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2497

    
2498
/* 8-wide SAD against pix2 half-pel interpolated vertically. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2519

    
2520
/* 8-wide SAD against pix2 half-pel interpolated both ways (2x2 avg). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2541

    
2542
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2543
    int i;
2544
    unsigned int sum=0;
2545

    
2546
    for(i=0; i<8*8; i++){
2547
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2548
        int w= weight[i];
2549
        b>>= RECON_SHIFT;
2550
        assert(-512<b && b<512);
2551

    
2552
        sum += (w*b)*(w*b)>>4;
2553
    }
2554
    return sum>>2;
2555
}
2556

    
2557
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2558
    int i;
2559

    
2560
    for(i=0; i<8*8; i++){
2561
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2562
    }    
2563
}
2564

    
2565
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];
    
    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms

    /* First pass: copy the (up to last+1) coefficients aside and clear
     * them in place.  Two passes are needed because a permuted
     * destination slot may coincide with a source slot not yet read. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }
    
    /* Second pass: scatter the saved coefficients to their permuted
     * positions. */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
2593

    
2594
/* Dummy comparison function: always reports a score of zero
 * (selected by FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
2597

    
2598
/* Fill the 5-entry cmp[] function-pointer array (one entry per block
 * size level used by the motion estimator) with the comparison family
 * selected by 'type'; only the low byte of 'type' picks the FF_CMP_*
 * family. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;
    
    /* clear all 5 slots first; families that provide fewer levels
       leave the rest NULL */
    memset(cmp, 0, sizeof(void*)*5);
        
    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2640

    
2641
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes all six 64-coefficient DCT blocks of a macroblock.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2648

    
2649
/* Byte-wise accumulate: dst[i] += src[i] for 0 <= i < w (wraps mod 256,
 * as uint8_t arithmetic does). */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int idx;

    for(idx = 0; idx < w; idx++)
        dst[idx] += src[idx];
}
2664

    
2665
/* Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w
 * (wraps mod 256). */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int idx;

    for(idx = 0; idx < w; idx++)
        dst[idx] = src1[idx] - src2[idx];
}
2680

    
2681
/* HuffYUV encoder median prediction: dst[i] = src2[i] minus the median
 * of (left, above, left+above-aboveleft), where src1 is the line above
 * and src2 the current line.  *left / *left_top carry the predictor
 * state into and out of the call.
 * NOTE(review): statement order looks deliberate -- src2[i] is read
 * before dst[i] is written, presumably so dst may alias src2; preserve
 * the order if touching this. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }    

    *left= l;
    *left_top= lt;
}
2698

    
2699
/* Hadamard butterfly producing two outputs (sum, difference) from two
 * inputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final stage: sum of the absolute values of the butterfly's two
 * outputs, |x+y| + |x-y|, without storing them. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2713

    
2714
/* SATD of an 8x8 block: applies an 8x8 Hadamard transform to the
 * difference src - dst and returns the sum of the absolute transform
 * coefficients.  Rows are transformed in the first loop, columns in
 * the second, with the last column butterfly folded into BUTTERFLYA. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);

    /* horizontal (per-row) 3-stage Hadamard transform of the residual */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (per-column) transform; the final butterfly stage is
       folded into BUTTERFLYA which directly accumulates |coeff| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2765

    
2766
/* Intra SATD of an 8x8 block: same Hadamard transform as
 * hadamard8_diff8x8_c but applied to the source pixels themselves;
 * the DC term is subtracted at the end so flat blocks score zero. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);
    
    /* horizontal (per-row) transform of the raw source pixels */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (per-column) transform with the last stage folded into
       BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    /* temp[8*0]+temp[8*4] is the DC coefficient at this point; its
       absolute value was counted in the loop above, so remove it */
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean
    
    return sum;
}
2813

    
2814
/* DCT-domain SAD: forward-transforms the pixel difference of two 8x8
 * blocks and returns the sum of the absolute DCT coefficients. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8]; /* uint64_t backing forces 8-byte alignment for the DCT */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;
    
    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2830

    
2831
void simple_idct(DCTELEM *block); //FIXME
2832

    
2833
/* Quantization-noise metric: quantizes and dequantizes the residual of
 * the two blocks and returns the squared error the quantizer
 * introduced (in DCT domain, via a reference simple_idct round trip). */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; /* two aligned 64-coeff blocks */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    /* keep an unquantized copy to diff against after the round trip */
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2856

    
2857
/* Rate-distortion comparison: fully encodes the 8x8 residual
 * (quantize + VLC bit counting), reconstructs it, and returns
 * distortion + lambda*bits, where lambda ~ qscale^2 * 109/128. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    assert(h==8);

    /* save the 8x8 prediction (src2) so we can reconstruct onto it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* count the VLC bits of all (run, level) pairs; the final nonzero
       coefficient uses the "last" table */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){ /* level in -64..63 -> in-table */
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64);
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    /* reconstruct: dequantize and add the IDCT onto the saved prediction */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2935

    
2936
/* Bit-count comparison: quantizes the 8x8 residual and returns only
 * the number of VLC bits needed to code it (no distortion term,
 * unlike rd8x8_c). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* count (run, level) VLC bits; last nonzero coeff uses the "last"
       table, out-of-range levels cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64);
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2995

    
2996
/**
 * Vertical SAD, intra variant: sum of absolute differences between
 * each pixel and the pixel directly below it over a 16-pixel-wide
 * block of h rows.  Measures vertical high-frequency content of the
 * source itself (the second operand is unused).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s[col] - s[col + stride];
            total += (d >= 0) ? d : -d;
        }
        s += stride;
    }

    return total;
}
/**
 * Vertical SAD of the difference signal: for each pixel, take the
 * residual s1-s2 and sum the absolute change of that residual between
 * vertically adjacent rows, over a 16-pixel-wide block of h rows.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = (s1[col] - s2[col]) - (s1[col + stride] - s2[col + stride]);
            total += (d >= 0) ? d : -d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE, intra variant: sum of squared differences between
 * each pixel and the pixel directly below it over a 16-pixel-wide
 * block of h rows (the second operand is unused).
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s[col] - s[col + stride];
            total += d * d;
        }
        s += stride;
    }

    return total;
}
/**
 * Vertical SSE of the difference signal: squared vertical variation
 * of the residual s1-s2 over a 16-pixel-wide block of h rows.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
/* Instantiate 16x16 variants of the 8x8 compare functions via the
   WARPER8_16_SQ macro (defined earlier in this file): each generated
   16x16 function delegates to its 8x8 counterpart over the sub-blocks
   of the macroblock and combines the partial scores -- see the macro
   definition for the exact combination (TODO confirm). */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */

/* Wrapper: run the j_rev_dct integer IDCT in place on the coefficient
   block, then store the clamped result into the destination picture. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}

/* Wrapper: run the same IDCT, then add the clamped result to the
   pixels already present at the destination (inter/predicted blocks). */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* init static data */
3078
void dsputil_static_init(void)
3079
{
3080
    int i;
3081

    
3082
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3083
    for(i=0;i<MAX_NEG_CROP;i++) {
3084
        cropTbl[i] = 0;
3085
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3086
    }
3087
    
3088
    for(i=0;i<512;i++) {
3089
        squareTbl[i] = (i - 256) * (i - 256);
3090
    }
3091
    
3092
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3093
}
3094

    
3095

    
3096
/**
 * Initialize a DSPContext function-pointer table.
 *
 * Every pointer is first filled with the portable C implementation,
 * chosen according to the codec options in avctx (DCT/IDCT algorithm
 * selection).  Platform-specific init functions are then given a
 * chance to override entries with optimized versions, and finally the
 * IDCT coefficient permutation table is built to match whichever IDCT
 * ended up selected.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    } 
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248; 
    } 
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type records how the
       chosen IDCT expects its input coefficients ordered */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct_put = vp3_idct_put_c;
    c->vp3_idct_add = vp3_idct_add_c;

    /* basic pixel-block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables: index [IDX] is the block size
       (16/8/4/2), [0..3] the subpel position (none/x2/y2/xy2) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3-style); indices 3, 7 and
       11..15 are intentionally unused in this scheme */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel tables: all 16 subpel positions (mcXY, X=horizontal
       quarter offset, Y=vertical quarter offset) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* WMV2 mspel motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
        
    /* compare functions: slot [0] is the 16x16 variant, [1] the 8x8 */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
    
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
        
    /* lossless / HuffYUV helpers */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
    
    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;
    
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* let each platform override entries with optimized versions;
       these may also change idct_permutation_type */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation matching the final IDCT choice */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}