Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ d4c5d2ad

History | View | Annotate | Download (124 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* Clipping LUT: cropTbl[MAX_NEG_CROP + x] clamps x to [0,255].
 * NOTE(review): table contents are filled at runtime elsewhere
 * (presumably in the dsputil init code) — confirm before relying on it. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl[256 + x] == x*x for x in [-256,255] (biased index so negative
 * differences can be looked up directly).
 * NOTE(review): also filled at runtime elsewhere — confirm. */
uint32_t squareTbl[512];
36

    
37
/* Standard zig-zag scan order for an 8x8 block:
 * index = scan position, value = raster position within the block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47

    
48
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields
   (index = scan position, value = raster position in the 8x8 block). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60

    
61
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): contents are filled at runtime elsewhere; __align8 is a
 * project macro requesting 8-byte alignment for the MMX code — confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
63

    
64
/* Alternate (horizontal) scan table: index = scan position,
 * value = raster position within the 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17, 
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 
    52, 53, 54, 55, 60, 61, 62, 63,
};
74

    
75
/* Alternate (vertical) scan table: index = scan position,
 * value = raster position within the 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10, 
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 
    38, 46, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table used to replace division by a multiply
 * and shift.  Entry 0 is unused; entry 1 saturates to 0xFFFFFFFF. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121

    
122
/* Input permutation for the simple_idct_mmx */
/* Each value is a 6-bit coefficient index (written in hex) giving the
 * source position for the corresponding destination slot. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133

    
134
/**
 * Sum all 256 samples of a 16x16 pixel block.
 * @param pix       top-left sample of the block
 * @param line_size stride in bytes between successive rows
 * @return plain sum of the samples (at most 255*256, fits in int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;   /* advance to the next row */
    }
    return total;
}
155

    
156
static int pix_norm1_c(uint8_t * pix, int line_size)
157
{
158
    int s, i, j;
159
    uint32_t *sq = squareTbl + 256;
160

    
161
    s = 0;
162
    for (i = 0; i < 16; i++) {
163
        for (j = 0; j < 16; j += 8) {
164
#if 0
165
            s += sq[pix[0]];
166
            s += sq[pix[1]];
167
            s += sq[pix[2]];
168
            s += sq[pix[3]];
169
            s += sq[pix[4]];
170
            s += sq[pix[5]];
171
            s += sq[pix[6]];
172
            s += sq[pix[7]];
173
#else
174
#if LONG_MAX > 2147483647
175
            register uint64_t x=*(uint64_t*)pix;
176
            s += sq[x&0xff];
177
            s += sq[(x>>8)&0xff];
178
            s += sq[(x>>16)&0xff];
179
            s += sq[(x>>24)&0xff];
180
            s += sq[(x>>32)&0xff];
181
            s += sq[(x>>40)&0xff];
182
            s += sq[(x>>48)&0xff];
183
            s += sq[(x>>56)&0xff];
184
#else
185
            register uint32_t x=*(uint32_t*)pix;
186
            s += sq[x&0xff];
187
            s += sq[(x>>8)&0xff];
188
            s += sq[(x>>16)&0xff];
189
            s += sq[(x>>24)&0xff];
190
            x=*(uint32_t*)(pix+4);
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
#endif
196
#endif
197
            pix += 8;
198
        }
199
        pix += line_size - 16;
200
    }
201
    return s;
202
}
203

    
204
/* Byte-swap w 32-bit words from src into dst (dst may alias src). */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    /* bulk: eight words per iteration */
    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* tail: remaining 0..7 words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221

    
222
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223
{
224
    int s, i;
225
    uint32_t *sq = squareTbl + 256;
226

    
227
    s = 0;
228
    for (i = 0; i < h; i++) {
229
        s += sq[pix1[0] - pix2[0]];
230
        s += sq[pix1[1] - pix2[1]];
231
        s += sq[pix1[2] - pix2[2]];
232
        s += sq[pix1[3] - pix2[3]];
233
        s += sq[pix1[4] - pix2[4]];
234
        s += sq[pix1[5] - pix2[5]];
235
        s += sq[pix1[6] - pix2[6]];
236
        s += sq[pix1[7] - pix2[7]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242

    
243
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244
{
245
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247

    
248
    s = 0;
249
    for (i = 0; i < h; i++) {
250
        s += sq[pix1[ 0] - pix2[ 0]];
251
        s += sq[pix1[ 1] - pix2[ 1]];
252
        s += sq[pix1[ 2] - pix2[ 2]];
253
        s += sq[pix1[ 3] - pix2[ 3]];
254
        s += sq[pix1[ 4] - pix2[ 4]];
255
        s += sq[pix1[ 5] - pix2[ 5]];
256
        s += sq[pix1[ 6] - pix2[ 6]];
257
        s += sq[pix1[ 7] - pix2[ 7]];
258
        s += sq[pix1[ 8] - pix2[ 8]];
259
        s += sq[pix1[ 9] - pix2[ 9]];
260
        s += sq[pix1[10] - pix2[10]];
261
        s += sq[pix1[11] - pix2[11]];
262
        s += sq[pix1[12] - pix2[12]];
263
        s += sq[pix1[13] - pix2[13]];
264
        s += sq[pix1[14] - pix2[14]];
265
        s += sq[pix1[15] - pix2[15]];
266

    
267
        pix1 += line_size;
268
        pix2 += line_size;
269
    }
270
    return s;
271
}
272

    
273
/**
 * Copy an 8x8 block of unsigned pixels into a DCT coefficient array.
 * @param block     destination, 64 contiguous DCTELEMs (row-major)
 * @param pixels    top-left source sample
 * @param line_size source stride in bytes between rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
291

    
292
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks into a DCT
 * coefficient array.
 * @param block  destination, 64 contiguous DCTELEMs (row-major)
 * @param s1,s2  top-left samples of the two source blocks
 * @param stride stride in bytes between rows (same for both sources)
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
311

    
312

    
313
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314
                                 int line_size)
315
{
316
    int i;
317
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
    
319
    /* read the pixels */
320
    for(i=0;i<8;i++) {
321
        pixels[0] = cm[block[0]];
322
        pixels[1] = cm[block[1]];
323
        pixels[2] = cm[block[2]];
324
        pixels[3] = cm[block[3]];
325
        pixels[4] = cm[block[4]];
326
        pixels[5] = cm[block[5]];
327
        pixels[6] = cm[block[6]];
328
        pixels[7] = cm[block[7]];
329

    
330
        pixels += line_size;
331
        block += 8;
332
    }
333
}
334

    
335
/**
 * Write an 8x8 block of signed DCT coefficients to pixels, re-centering
 * by +128 and saturating to [0,255] (values < -128 become 0, > 127
 * become 255).
 * @param block     source, 64 contiguous DCTELEMs (row-major)
 * @param pixels    top-left destination sample
 * @param line_size destination stride in bytes between rows
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block, 
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col];
            pixels[col] = v < -128 ? 0
                        : v >  127 ? 255
                        : (uint8_t)(v + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
355

    
356
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
357
                          int line_size)
358
{
359
    int i;
360
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
361
    
362
    /* read the pixels */
363
    for(i=0;i<8;i++) {
364
        pixels[0] = cm[pixels[0] + block[0]];
365
        pixels[1] = cm[pixels[1] + block[1]];
366
        pixels[2] = cm[pixels[2] + block[2]];
367
        pixels[3] = cm[pixels[3] + block[3]];
368
        pixels[4] = cm[pixels[4] + block[4]];
369
        pixels[5] = cm[pixels[5] + block[5]];
370
        pixels[6] = cm[pixels[6] + block[6]];
371
        pixels[7] = cm[pixels[7] + block[7]];
372
        pixels += line_size;
373
        block += 8;
374
    }
375
}
376
#if 0
377

378
#define PIXOP2(OPNAME, OP) \
379
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
380
{\
381
    int i;\
382
    for(i=0; i<h; i++){\
383
        OP(*((uint64_t*)block), LD64(pixels));\
384
        pixels+=line_size;\
385
        block +=line_size;\
386
    }\
387
}\
388
\
389
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
390
{\
391
    int i;\
392
    for(i=0; i<h; i++){\
393
        const uint64_t a= LD64(pixels  );\
394
        const uint64_t b= LD64(pixels+1);\
395
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
396
        pixels+=line_size;\
397
        block +=line_size;\
398
    }\
399
}\
400
\
401
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
402
{\
403
    int i;\
404
    for(i=0; i<h; i++){\
405
        const uint64_t a= LD64(pixels  );\
406
        const uint64_t b= LD64(pixels+1);\
407
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
408
        pixels+=line_size;\
409
        block +=line_size;\
410
    }\
411
}\
412
\
413
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
414
{\
415
    int i;\
416
    for(i=0; i<h; i++){\
417
        const uint64_t a= LD64(pixels          );\
418
        const uint64_t b= LD64(pixels+line_size);\
419
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
420
        pixels+=line_size;\
421
        block +=line_size;\
422
    }\
423
}\
424
\
425
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
426
{\
427
    int i;\
428
    for(i=0; i<h; i++){\
429
        const uint64_t a= LD64(pixels          );\
430
        const uint64_t b= LD64(pixels+line_size);\
431
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
432
        pixels+=line_size;\
433
        block +=line_size;\
434
    }\
435
}\
436
\
437
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
438
{\
439
        int i;\
440
        const uint64_t a= LD64(pixels  );\
441
        const uint64_t b= LD64(pixels+1);\
442
        uint64_t l0=  (a&0x0303030303030303ULL)\
443
                    + (b&0x0303030303030303ULL)\
444
                    + 0x0202020202020202ULL;\
445
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
446
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
447
        uint64_t l1,h1;\
448
\
449
        pixels+=line_size;\
450
        for(i=0; i<h; i+=2){\
451
            uint64_t a= LD64(pixels  );\
452
            uint64_t b= LD64(pixels+1);\
453
            l1=  (a&0x0303030303030303ULL)\
454
               + (b&0x0303030303030303ULL);\
455
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
456
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
457
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
458
            pixels+=line_size;\
459
            block +=line_size;\
460
            a= LD64(pixels  );\
461
            b= LD64(pixels+1);\
462
            l0=  (a&0x0303030303030303ULL)\
463
               + (b&0x0303030303030303ULL)\
464
               + 0x0202020202020202ULL;\
465
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
466
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
467
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
468
            pixels+=line_size;\
469
            block +=line_size;\
470
        }\
471
}\
472
\
473
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
474
{\
475
        int i;\
476
        const uint64_t a= LD64(pixels  );\
477
        const uint64_t b= LD64(pixels+1);\
478
        uint64_t l0=  (a&0x0303030303030303ULL)\
479
                    + (b&0x0303030303030303ULL)\
480
                    + 0x0101010101010101ULL;\
481
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
482
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
483
        uint64_t l1,h1;\
484
\
485
        pixels+=line_size;\
486
        for(i=0; i<h; i+=2){\
487
            uint64_t a= LD64(pixels  );\
488
            uint64_t b= LD64(pixels+1);\
489
            l1=  (a&0x0303030303030303ULL)\
490
               + (b&0x0303030303030303ULL);\
491
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
492
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
493
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
494
            pixels+=line_size;\
495
            block +=line_size;\
496
            a= LD64(pixels  );\
497
            b= LD64(pixels+1);\
498
            l0=  (a&0x0303030303030303ULL)\
499
               + (b&0x0303030303030303ULL)\
500
               + 0x0101010101010101ULL;\
501
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
502
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
503
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
504
            pixels+=line_size;\
505
            block +=line_size;\
506
        }\
507
}\
508
\
509
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
510
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
511
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
512
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
513
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
514
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
515
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
516

517
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
518
#else // 64 bit variant
519

    
520
#define PIXOP2(OPNAME, OP) \
521
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
522
    int i;\
523
    for(i=0; i<h; i++){\
524
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
525
        pixels+=line_size;\
526
        block +=line_size;\
527
    }\
528
}\
529
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
530
    int i;\
531
    for(i=0; i<h; i++){\
532
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
533
        pixels+=line_size;\
534
        block +=line_size;\
535
    }\
536
}\
537
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
538
    int i;\
539
    for(i=0; i<h; i++){\
540
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
541
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
542
        pixels+=line_size;\
543
        block +=line_size;\
544
    }\
545
}\
546
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
547
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
548
}\
549
\
550
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
551
                                                int src_stride1, int src_stride2, int h){\
552
    int i;\
553
    for(i=0; i<h; i++){\
554
        uint32_t a,b;\
555
        a= LD32(&src1[i*src_stride1  ]);\
556
        b= LD32(&src2[i*src_stride2  ]);\
557
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
558
        a= LD32(&src1[i*src_stride1+4]);\
559
        b= LD32(&src2[i*src_stride2+4]);\
560
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
561
    }\
562
}\
563
\
564
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
565
                                                int src_stride1, int src_stride2, int h){\
566
    int i;\
567
    for(i=0; i<h; i++){\
568
        uint32_t a,b;\
569
        a= LD32(&src1[i*src_stride1  ]);\
570
        b= LD32(&src2[i*src_stride2  ]);\
571
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
572
        a= LD32(&src1[i*src_stride1+4]);\
573
        b= LD32(&src2[i*src_stride2+4]);\
574
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
575
    }\
576
}\
577
\
578
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579
                                                int src_stride1, int src_stride2, int h){\
580
    int i;\
581
    for(i=0; i<h; i++){\
582
        uint32_t a,b;\
583
        a= LD32(&src1[i*src_stride1  ]);\
584
        b= LD32(&src2[i*src_stride2  ]);\
585
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
586
    }\
587
}\
588
\
589
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
590
                                                int src_stride1, int src_stride2, int h){\
591
    int i;\
592
    for(i=0; i<h; i++){\
593
        uint32_t a,b;\
594
        a= LD16(&src1[i*src_stride1  ]);\
595
        b= LD16(&src2[i*src_stride2  ]);\
596
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
597
    }\
598
}\
599
\
600
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
601
                                                int src_stride1, int src_stride2, int h){\
602
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
603
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
604
}\
605
\
606
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
607
                                                int src_stride1, int src_stride2, int h){\
608
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
609
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
610
}\
611
\
612
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
613
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
614
}\
615
\
616
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
617
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
618
}\
619
\
620
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
621
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
622
}\
623
\
624
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
625
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
626
}\
627
\
628
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
629
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
630
    int i;\
631
    for(i=0; i<h; i++){\
632
        uint32_t a, b, c, d, l0, l1, h0, h1;\
633
        a= LD32(&src1[i*src_stride1]);\
634
        b= LD32(&src2[i*src_stride2]);\
635
        c= LD32(&src3[i*src_stride3]);\
636
        d= LD32(&src4[i*src_stride4]);\
637
        l0=  (a&0x03030303UL)\
638
           + (b&0x03030303UL)\
639
           + 0x02020202UL;\
640
        h0= ((a&0xFCFCFCFCUL)>>2)\
641
          + ((b&0xFCFCFCFCUL)>>2);\
642
        l1=  (c&0x03030303UL)\
643
           + (d&0x03030303UL);\
644
        h1= ((c&0xFCFCFCFCUL)>>2)\
645
          + ((d&0xFCFCFCFCUL)>>2);\
646
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
647
        a= LD32(&src1[i*src_stride1+4]);\
648
        b= LD32(&src2[i*src_stride2+4]);\
649
        c= LD32(&src3[i*src_stride3+4]);\
650
        d= LD32(&src4[i*src_stride4+4]);\
651
        l0=  (a&0x03030303UL)\
652
           + (b&0x03030303UL)\
653
           + 0x02020202UL;\
654
        h0= ((a&0xFCFCFCFCUL)>>2)\
655
          + ((b&0xFCFCFCFCUL)>>2);\
656
        l1=  (c&0x03030303UL)\
657
           + (d&0x03030303UL);\
658
        h1= ((c&0xFCFCFCFCUL)>>2)\
659
          + ((d&0xFCFCFCFCUL)>>2);\
660
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
661
    }\
662
}\
663
\
664
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
665
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
666
}\
667
\
668
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
669
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
670
}\
671
\
672
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
673
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
674
}\
675
\
676
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
677
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
678
}\
679
\
680
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
682
    int i;\
683
    for(i=0; i<h; i++){\
684
        uint32_t a, b, c, d, l0, l1, h0, h1;\
685
        a= LD32(&src1[i*src_stride1]);\
686
        b= LD32(&src2[i*src_stride2]);\
687
        c= LD32(&src3[i*src_stride3]);\
688
        d= LD32(&src4[i*src_stride4]);\
689
        l0=  (a&0x03030303UL)\
690
           + (b&0x03030303UL)\
691
           + 0x01010101UL;\
692
        h0= ((a&0xFCFCFCFCUL)>>2)\
693
          + ((b&0xFCFCFCFCUL)>>2);\
694
        l1=  (c&0x03030303UL)\
695
           + (d&0x03030303UL);\
696
        h1= ((c&0xFCFCFCFCUL)>>2)\
697
          + ((d&0xFCFCFCFCUL)>>2);\
698
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
699
        a= LD32(&src1[i*src_stride1+4]);\
700
        b= LD32(&src2[i*src_stride2+4]);\
701
        c= LD32(&src3[i*src_stride3+4]);\
702
        d= LD32(&src4[i*src_stride4+4]);\
703
        l0=  (a&0x03030303UL)\
704
           + (b&0x03030303UL)\
705
           + 0x01010101UL;\
706
        h0= ((a&0xFCFCFCFCUL)>>2)\
707
          + ((b&0xFCFCFCFCUL)>>2);\
708
        l1=  (c&0x03030303UL)\
709
           + (d&0x03030303UL);\
710
        h1= ((c&0xFCFCFCFCUL)>>2)\
711
          + ((d&0xFCFCFCFCUL)>>2);\
712
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
713
    }\
714
}\
715
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
716
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
717
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
718
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
719
}\
720
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
721
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
722
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
723
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
724
}\
725
\
726
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
727
{\
728
        int i, a0, b0, a1, b1;\
729
        a0= pixels[0];\
730
        b0= pixels[1] + 2;\
731
        a0 += b0;\
732
        b0 += pixels[2];\
733
\
734
        pixels+=line_size;\
735
        for(i=0; i<h; i+=2){\
736
            a1= pixels[0];\
737
            b1= pixels[1];\
738
            a1 += b1;\
739
            b1 += pixels[2];\
740
\
741
            block[0]= (a1+a0)>>2; /* FIXME non put */\
742
            block[1]= (b1+b0)>>2;\
743
\
744
            pixels+=line_size;\
745
            block +=line_size;\
746
\
747
            a0= pixels[0];\
748
            b0= pixels[1] + 2;\
749
            a0 += b0;\
750
            b0 += pixels[2];\
751
\
752
            block[0]= (a1+a0)>>2;\
753
            block[1]= (b1+b0)>>2;\
754
            pixels+=line_size;\
755
            block +=line_size;\
756
        }\
757
}\
758
\
759
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
760
{\
761
        int i;\
762
        const uint32_t a= LD32(pixels  );\
763
        const uint32_t b= LD32(pixels+1);\
764
        uint32_t l0=  (a&0x03030303UL)\
765
                    + (b&0x03030303UL)\
766
                    + 0x02020202UL;\
767
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
768
                   + ((b&0xFCFCFCFCUL)>>2);\
769
        uint32_t l1,h1;\
770
\
771
        pixels+=line_size;\
772
        for(i=0; i<h; i+=2){\
773
            uint32_t a= LD32(pixels  );\
774
            uint32_t b= LD32(pixels+1);\
775
            l1=  (a&0x03030303UL)\
776
               + (b&0x03030303UL);\
777
            h1= ((a&0xFCFCFCFCUL)>>2)\
778
              + ((b&0xFCFCFCFCUL)>>2);\
779
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
780
            pixels+=line_size;\
781
            block +=line_size;\
782
            a= LD32(pixels  );\
783
            b= LD32(pixels+1);\
784
            l0=  (a&0x03030303UL)\
785
               + (b&0x03030303UL)\
786
               + 0x02020202UL;\
787
            h0= ((a&0xFCFCFCFCUL)>>2)\
788
              + ((b&0xFCFCFCFCUL)>>2);\
789
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
790
            pixels+=line_size;\
791
            block +=line_size;\
792
        }\
793
}\
794
\
795
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
796
{\
797
    int j;\
798
    for(j=0; j<2; j++){\
799
        int i;\
800
        const uint32_t a= LD32(pixels  );\
801
        const uint32_t b= LD32(pixels+1);\
802
        uint32_t l0=  (a&0x03030303UL)\
803
                    + (b&0x03030303UL)\
804
                    + 0x02020202UL;\
805
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
806
                   + ((b&0xFCFCFCFCUL)>>2);\
807
        uint32_t l1,h1;\
808
\
809
        pixels+=line_size;\
810
        for(i=0; i<h; i+=2){\
811
            uint32_t a= LD32(pixels  );\
812
            uint32_t b= LD32(pixels+1);\
813
            l1=  (a&0x03030303UL)\
814
               + (b&0x03030303UL);\
815
            h1= ((a&0xFCFCFCFCUL)>>2)\
816
              + ((b&0xFCFCFCFCUL)>>2);\
817
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
818
            pixels+=line_size;\
819
            block +=line_size;\
820
            a= LD32(pixels  );\
821
            b= LD32(pixels+1);\
822
            l0=  (a&0x03030303UL)\
823
               + (b&0x03030303UL)\
824
               + 0x02020202UL;\
825
            h0= ((a&0xFCFCFCFCUL)>>2)\
826
              + ((b&0xFCFCFCFCUL)>>2);\
827
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
828
            pixels+=line_size;\
829
            block +=line_size;\
830
        }\
831
        pixels+=4-line_size*(h+1);\
832
        block +=4-line_size*h;\
833
    }\
834
}\
835
\
836
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
837
{\
838
    int j;\
839
    for(j=0; j<2; j++){\
840
        int i;\
841
        const uint32_t a= LD32(pixels  );\
842
        const uint32_t b= LD32(pixels+1);\
843
        uint32_t l0=  (a&0x03030303UL)\
844
                    + (b&0x03030303UL)\
845
                    + 0x01010101UL;\
846
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
847
                   + ((b&0xFCFCFCFCUL)>>2);\
848
        uint32_t l1,h1;\
849
\
850
        pixels+=line_size;\
851
        for(i=0; i<h; i+=2){\
852
            uint32_t a= LD32(pixels  );\
853
            uint32_t b= LD32(pixels+1);\
854
            l1=  (a&0x03030303UL)\
855
               + (b&0x03030303UL);\
856
            h1= ((a&0xFCFCFCFCUL)>>2)\
857
              + ((b&0xFCFCFCFCUL)>>2);\
858
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
859
            pixels+=line_size;\
860
            block +=line_size;\
861
            a= LD32(pixels  );\
862
            b= LD32(pixels+1);\
863
            l0=  (a&0x03030303UL)\
864
               + (b&0x03030303UL)\
865
               + 0x01010101UL;\
866
            h0= ((a&0xFCFCFCFCUL)>>2)\
867
              + ((b&0xFCFCFCFCUL)>>2);\
868
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
869
            pixels+=line_size;\
870
            block +=line_size;\
871
        }\
872
        pixels+=4-line_size*(h+1);\
873
        block +=4-line_size*h;\
874
    }\
875
}\
876
\
877
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
878
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
879
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
880
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
881
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
882
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
883
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
884
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
885

    
886
/* Store operations fed to PIXOP2: op_avg rounds-and-averages the new value
 * into the destination word, op_put overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* NOTE(review): this #endif closes an #if/#else that starts above this chunk
 * (apparently a 32- vs 64-bit PIXOP2 variant selector) -- confirm pairing. */
#endif
#define op_put(a, b) a = b

/* Instantiate the full families of avg_/put_ pixel copy+blend primitives. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
894

    
895
/* Rounding averages: avg2 = floor((a+b+1)/2), avg4 = floor((a+b+c+d+2)/4).
 * Fix: arguments are now fully parenthesized so the macros remain correct
 * when passed compound expressions (e.g. bitwise-or or ?: operands). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
897

    
898
/* Adapter: exposes the PIXOP2-generated put_no_rnd_pixels16_l2 (which takes
 * one stride per buffer) with a single common stride for dst, a and b. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
901

    
902
/* Adapter: same as put_no_rnd_pixels16_l2_c but for 8-pixel-wide blocks. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
905

    
906
/*
 * One-warp-point GMC: bilinear interpolation of an 8-pixel-wide, h-row strip
 * at a fixed 1/16-pel sub-position (x16, y16).  The four corner weights sum
 * to 256, so each output is (weighted sum + rounder) >> 8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16); /* top-left weight     */
    const int B = (     x16) * (16 - y16); /* top-right weight    */
    const int C = (16 - x16) * (     y16); /* bottom-left weight  */
    const int D = (     x16) * (     y16); /* bottom-right weight */
    int row, col;

    for (row = 0; row < h; row++) {
        /* same blend as the original unrolled 8 stores, expressed as a loop */
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
928

    
929
/*
 * Global motion compensation with a full affine warp: for each of the 8x h
 * destination pixels, a source position in fixed point is derived from
 * (ox, oy) advanced by (dxx, dyx) per column and (dxy, dyy) per row, then the
 * pixel is bilinearly interpolated with 1<<shift sub-pel precision and
 * rounding constant r.  Samples whose integer position falls outside
 * [0, width) x [0, height) are clamped to the border (edge replication),
 * degenerating to 1-D or nearest-neighbour interpolation as appropriate.
 * `clip` is a project helper (integer clamp to [lo, hi]).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel scale; frac_x/frac_y are in [0, s) */
    
    /* convert to the largest valid integer coordinate for the bounds tests */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* NOTE(review): fractional bits are taken from (v>>16) & (s-1),
             * i.e. the motion vectors carry 16+shift fractional bits --
             * confirm this precision convention against the caller. */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;
  
            /* (unsigned) compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear blend of the 4 neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, horizontal-only blend */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, vertical-only blend */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both axes: nearest clamped sample, no blend */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            /* advance along the row by the per-column increments */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance the row origin by the per-row increments */
        ox += dxy;
        oy += dyy;
    }
}
986

    
987
/* Third-pel MC, phase (0,0): plain copy.  Dispatches on block width to the
 * fixed-size put_pixelsN_c copy routines defined elsewhere in this file;
 * widths other than 2/4/8/16 are silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
995

    
996
/* Third-pel MC, phase (1/3, 0): dst = round((2*a + b)/3) of horizontal
 * neighbours; 683 = round(2^11 / 3), +1 provides the rounding bias. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1006

    
1007
/* Third-pel MC, phase (2/3, 0): dst = round((a + 2*b)/3) of horizontal
 * neighbours; 683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1017
    
1018
/* Third-pel MC, phase (0, 1/3): dst = round((2*a + b)/3) of vertical
 * neighbours; 683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + below[x] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1028
    
1029
/* Third-pel MC, phase (1/3, 1/3): bilinear blend of the 2x2 neighbourhood
 * with weights 4/3/3/2 (sum 12); 2731 = round(2^15 / 12), +6 rounds. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(4*src[x] + 3*src[x+1] + 3*below[x] + 2*below[x+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1039

    
1040
/* Third-pel MC, phase (1/3, 2/3): bilinear blend with weights 3/2/4/3
 * (sum 12); 2731 = round(2^15 / 12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 2*src[x+1] + 4*below[x] + 3*below[x+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1050

    
1051
/* Third-pel MC, phase (0, 2/3): dst = round((a + 2*b)/3) of vertical
 * neighbours; 683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*below[x] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1061

    
1062
/* Third-pel MC, phase (2/3, 1/3): bilinear blend with weights 3/4/2/3
 * (sum 12); 2731 = round(2^15 / 12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 4*src[x+1] + 2*below[x] + 3*below[x+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1072

    
1073
/* Third-pel MC, phase (2/3, 2/3): bilinear blend with weights 2/3/3/4
 * (sum 12); 2731 = round(2^15 / 12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(2*src[x] + 3*src[x+1] + 3*below[x] + 4*below[x+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1083

    
1084
/* Third-pel MC, phase (0,0), averaging variant: blends the source into dst
 * via the fixed-size avg_pixelsN_c routines defined elsewhere in this file;
 * widths other than 2/4/8/16 are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1092

    
1093
/* Averaging variant of put_tpel_pixels_mc10_c: the interpolated value is
 * rounding-averaged into the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1103

    
1104
/* Averaging variant of put_tpel_pixels_mc20_c. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1114
    
1115
/* Averaging variant of put_tpel_pixels_mc01_c. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + below[x] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1125
    
1126
/* Averaging variant of put_tpel_pixels_mc11_c. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(4*src[x] + 3*src[x+1] + 3*below[x] + 2*below[x+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1136

    
1137
/* Averaging variant of put_tpel_pixels_mc12_c. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 2*src[x+1] + 4*below[x] + 3*below[x+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1147

    
1148
/* Averaging variant of put_tpel_pixels_mc02_c. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*below[x] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1158

    
1159
/* Averaging variant of put_tpel_pixels_mc21_c. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 4*src[x+1] + 2*below[x] + 3*below[x+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1169

    
1170
/* Averaging variant of put_tpel_pixels_mc22_c. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row = height;
    while (row-- > 0) {
        const uint8_t *below = src + stride;
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(2*src[x] + 3*src[x+1] + 3*below[x] + 4*below[x+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1180
#if 0
1181
#define TPEL_WIDTH(width)\
1182
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1183
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1184
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1185
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1186
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1187
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1188
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1189
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1190
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1191
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1192
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1193
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1194
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1195
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1196
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1197
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1198
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1199
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1200
#endif
1201

    
1202
/*
 * H264_CHROMA_MC(OPNAME, OP) instantiates 2-, 4- and 8-pixel-wide chroma
 * motion-compensation routines.  Each output pixel is the bilinear blend
 * A*tl + B*tr + C*bl + D*br of its 2x2 source neighbourhood, with weights
 * derived from the 1/8-pel fractional offsets x,y (A+B+C+D == 64).  OP is
 * the store operation (see op_put/op_avg below) and performs the +32 >> 6
 * normalisation of the weighted sum.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1264

    
1265
/* Store ops for H264_CHROMA_MC: b is a 6-bit-scaled weighted sum, so
 * (b+32)>>6 rounds it back to pixel range; op_avg additionally
 * rounding-averages with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1272

    
1273
/* Copies a 4-byte-wide, h-row block.  LD32/ST32 are project macros
 * (NOTE(review): assumed to be unaligned-safe 32-bit load/store, per their
 * use throughout this file -- confirm in dsputil.h). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1283

    
1284
/* Copies an 8-byte-wide, h-row block using two 32-bit transfers per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1295

    
1296
/* Copies a 16-byte-wide, h-row block using four 32-bit transfers per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1309

    
1310
/* Copies a 17-byte-wide, h-row block: four 32-bit transfers plus a single
 * tail byte per row (17 = 16+1, the extra column needed by the qpel filter). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1324

    
1325
/* Copies a 9-byte-wide, h-row block: two 32-bit transfers plus a single
 * tail byte per row (9 = 8+1, the extra column needed by the qpel filter). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1337

    
1338

    
1339
#define QPEL_MC(r, OPNAME, RND, OP) \
1340
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1342
    int i;\
1343
    for(i=0; i<h; i++)\
1344
    {\
1345
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353
        dst+=dstStride;\
1354
        src+=srcStride;\
1355
    }\
1356
}\
1357
\
1358
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1359
    const int w=8;\
1360
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1361
    int i;\
1362
    for(i=0; i<w; i++)\
1363
    {\
1364
        const int src0= src[0*srcStride];\
1365
        const int src1= src[1*srcStride];\
1366
        const int src2= src[2*srcStride];\
1367
        const int src3= src[3*srcStride];\
1368
        const int src4= src[4*srcStride];\
1369
        const int src5= src[5*srcStride];\
1370
        const int src6= src[6*srcStride];\
1371
        const int src7= src[7*srcStride];\
1372
        const int src8= src[8*srcStride];\
1373
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381
        dst++;\
1382
        src++;\
1383
    }\
1384
}\
1385
\
1386
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1388
    int i;\
1389
    \
1390
    for(i=0; i<h; i++)\
1391
    {\
1392
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408
        dst+=dstStride;\
1409
        src+=srcStride;\
1410
    }\
1411
}\
1412
\
1413
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1415
    int i;\
1416
    const int w=16;\
1417
    for(i=0; i<w; i++)\
1418
    {\
1419
        const int src0= src[0*srcStride];\
1420
        const int src1= src[1*srcStride];\
1421
        const int src2= src[2*srcStride];\
1422
        const int src3= src[3*srcStride];\
1423
        const int src4= src[4*srcStride];\
1424
        const int src5= src[5*srcStride];\
1425
        const int src6= src[6*srcStride];\
1426
        const int src7= src[7*srcStride];\
1427
        const int src8= src[8*srcStride];\
1428
        const int src9= src[9*srcStride];\
1429
        const int src10= src[10*srcStride];\
1430
        const int src11= src[11*srcStride];\
1431
        const int src12= src[12*srcStride];\
1432
        const int src13= src[13*srcStride];\
1433
        const int src14= src[14*srcStride];\
1434
        const int src15= src[15*srcStride];\
1435
        const int src16= src[16*srcStride];\
1436
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452
        dst++;\
1453
        src++;\
1454
    }\
1455
}\
1456
\
1457
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1458
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1459
}\
1460
\
1461
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462
    uint8_t half[64];\
1463
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1465
}\
1466
\
1467
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1468
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1469
}\
1470
\
1471
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472
    uint8_t half[64];\
1473
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1475
}\
1476
\
1477
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478
    uint8_t full[16*9];\
1479
    uint8_t half[64];\
1480
    copy_block9(full, src, 16, stride, 9);\
1481
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1482
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1483
}\
1484
\
1485
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486
    uint8_t full[16*9];\
1487
    copy_block9(full, src, 16, stride, 9);\
1488
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1489
}\
1490
\
1491
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492
    uint8_t full[16*9];\
1493
    uint8_t half[64];\
1494
    copy_block9(full, src, 16, stride, 9);\
1495
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1496
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1497
}\
1498
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499
    uint8_t full[16*9];\
1500
    uint8_t halfH[72];\
1501
    uint8_t halfV[64];\
1502
    uint8_t halfHV[64];\
1503
    copy_block9(full, src, 16, stride, 9);\
1504
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508
}\
1509
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510
    uint8_t full[16*9];\
1511
    uint8_t halfH[72];\
1512
    uint8_t halfHV[64];\
1513
    copy_block9(full, src, 16, stride, 9);\
1514
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518
}\
1519
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520
    uint8_t full[16*9];\
1521
    uint8_t halfH[72];\
1522
    uint8_t halfV[64];\
1523
    uint8_t halfHV[64];\
1524
    copy_block9(full, src, 16, stride, 9);\
1525
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529
}\
1530
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531
    uint8_t full[16*9];\
1532
    uint8_t halfH[72];\
1533
    uint8_t halfHV[64];\
1534
    copy_block9(full, src, 16, stride, 9);\
1535
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539
}\
1540
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541
    uint8_t full[16*9];\
1542
    uint8_t halfH[72];\
1543
    uint8_t halfV[64];\
1544
    uint8_t halfHV[64];\
1545
    copy_block9(full, src, 16, stride, 9);\
1546
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1547
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550
}\
1551
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552
    uint8_t full[16*9];\
1553
    uint8_t halfH[72];\
1554
    uint8_t halfHV[64];\
1555
    copy_block9(full, src, 16, stride, 9);\
1556
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560
}\
1561
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562
    uint8_t full[16*9];\
1563
    uint8_t halfH[72];\
1564
    uint8_t halfV[64];\
1565
    uint8_t halfHV[64];\
1566
    copy_block9(full, src, 16, stride, 9);\
1567
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1568
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1570
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1571
}\
1572
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573
    uint8_t full[16*9];\
1574
    uint8_t halfH[72];\
1575
    uint8_t halfHV[64];\
1576
    copy_block9(full, src, 16, stride, 9);\
1577
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581
}\
1582
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583
    uint8_t halfH[72];\
1584
    uint8_t halfHV[64];\
1585
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1586
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1587
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1588
}\
1589
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590
    uint8_t halfH[72];\
1591
    uint8_t halfHV[64];\
1592
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1593
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1594
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1595
}\
1596
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597
    uint8_t full[16*9];\
1598
    uint8_t halfH[72];\
1599
    uint8_t halfV[64];\
1600
    uint8_t halfHV[64];\
1601
    copy_block9(full, src, 16, stride, 9);\
1602
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1605
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1606
}\
1607
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608
    uint8_t full[16*9];\
1609
    uint8_t halfH[72];\
1610
    copy_block9(full, src, 16, stride, 9);\
1611
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614
}\
1615
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616
    uint8_t full[16*9];\
1617
    uint8_t halfH[72];\
1618
    uint8_t halfV[64];\
1619
    uint8_t halfHV[64];\
1620
    copy_block9(full, src, 16, stride, 9);\
1621
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1622
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1624
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1625
}\
1626
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627
    uint8_t full[16*9];\
1628
    uint8_t halfH[72];\
1629
    copy_block9(full, src, 16, stride, 9);\
1630
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633
}\
1634
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635
    uint8_t halfH[72];\
1636
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1637
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1638
}\
1639
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1641
}\
1642
\
1643
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644
    uint8_t half[256];\
1645
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647
}\
1648
\
1649
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1651
}\
1652
\
1653
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654
    uint8_t half[256];\
1655
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657
}\
1658
\
1659
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660
    uint8_t full[24*17];\
1661
    uint8_t half[256];\
1662
    copy_block17(full, src, 24, stride, 17);\
1663
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1664
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665
}\
1666
\
1667
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668
    uint8_t full[24*17];\
1669
    copy_block17(full, src, 24, stride, 17);\
1670
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1671
}\
1672
\
1673
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674
    uint8_t full[24*17];\
1675
    uint8_t half[256];\
1676
    copy_block17(full, src, 24, stride, 17);\
1677
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1678
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679
}\
1680
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681
    uint8_t full[24*17];\
1682
    uint8_t halfH[272];\
1683
    uint8_t halfV[256];\
1684
    uint8_t halfHV[256];\
1685
    copy_block17(full, src, 24, stride, 17);\
1686
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690
}\
1691
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692
    uint8_t full[24*17];\
1693
    uint8_t halfH[272];\
1694
    uint8_t halfHV[256];\
1695
    copy_block17(full, src, 24, stride, 17);\
1696
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700
}\
1701
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702
    uint8_t full[24*17];\
1703
    uint8_t halfH[272];\
1704
    uint8_t halfV[256];\
1705
    uint8_t halfHV[256];\
1706
    copy_block17(full, src, 24, stride, 17);\
1707
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711
}\
1712
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713
    uint8_t full[24*17];\
1714
    uint8_t halfH[272];\
1715
    uint8_t halfHV[256];\
1716
    copy_block17(full, src, 24, stride, 17);\
1717
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721
}\
1722
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723
    uint8_t full[24*17];\
1724
    uint8_t halfH[272];\
1725
    uint8_t halfV[256];\
1726
    uint8_t halfHV[256];\
1727
    copy_block17(full, src, 24, stride, 17);\
1728
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1729
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732
}\
1733
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734
    uint8_t full[24*17];\
1735
    uint8_t halfH[272];\
1736
    uint8_t halfHV[256];\
1737
    copy_block17(full, src, 24, stride, 17);\
1738
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742
}\
1743
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744
    uint8_t full[24*17];\
1745
    uint8_t halfH[272];\
1746
    uint8_t halfV[256];\
1747
    uint8_t halfHV[256];\
1748
    copy_block17(full, src, 24, stride, 17);\
1749
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1750
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1752
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753
}\
1754
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755
    uint8_t full[24*17];\
1756
    uint8_t halfH[272];\
1757
    uint8_t halfHV[256];\
1758
    copy_block17(full, src, 24, stride, 17);\
1759
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763
}\
1764
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765
    uint8_t halfH[272];\
1766
    uint8_t halfHV[256];\
1767
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1768
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1769
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770
}\
1771
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772
    uint8_t halfH[272];\
1773
    uint8_t halfHV[256];\
1774
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1775
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1776
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777
}\
1778
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779
    uint8_t full[24*17];\
1780
    uint8_t halfH[272];\
1781
    uint8_t halfV[256];\
1782
    uint8_t halfHV[256];\
1783
    copy_block17(full, src, 24, stride, 17);\
1784
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1787
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788
}\
1789
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790
    uint8_t full[24*17];\
1791
    uint8_t halfH[272];\
1792
    copy_block17(full, src, 24, stride, 17);\
1793
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796
}\
1797
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798
    uint8_t full[24*17];\
1799
    uint8_t halfH[272];\
1800
    uint8_t halfV[256];\
1801
    uint8_t halfHV[256];\
1802
    copy_block17(full, src, 24, stride, 17);\
1803
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1804
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1806
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807
}\
1808
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809
    uint8_t full[24*17];\
1810
    uint8_t halfH[272];\
1811
    copy_block17(full, src, 24, stride, 17);\
1812
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815
}\
1816
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817
    uint8_t halfH[272];\
1818
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1819
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1820
}
1821

    
1822
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1823
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1824
#define op_put(a, b) a = cm[((b) + 16)>>5]
1825
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1826

    
1827
QPEL_MC(0, put_       , _       , op_put)
1828
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1829
QPEL_MC(0, avg_       , _       , op_avg)
1830
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1831
#undef op_avg
1832
#undef op_avg_no_rnd
1833
#undef op_put
1834
#undef op_put_no_rnd
1835

    
1836
#if 1
/* H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the C reference implementations of the H.264 6-tap
 * (1,-5,20,20,-5,1) half-sample interpolation filters for 4x4, 8x8 and
 * 16x16 blocks:
 *   - *_h_lowpass : horizontal filtering, result stored via OP
 *   - *_v_lowpass : vertical filtering, result stored via OP
 *   - *_hv_lowpass: horizontal pass into the int16_t `tmp` buffer
 *                   (h+5 rows to cover the vertical filter footprint),
 *                   then vertical pass stored via OP2 (which must apply
 *                   the wider >>10 rounding).
 * `cm` is the clipping table (cropTbl + MAX_NEG_CROP) used by OP/OP2.
 * The 16-pixel variants are built from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
2039

    
2040
#define H264_MC(OPNAME, SIZE) \
2041
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2042
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2043
}\
2044
\
2045
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2046
    uint8_t half[SIZE*SIZE];\
2047
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2048
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2049
}\
2050
\
2051
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2052
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2053
}\
2054
\
2055
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2056
    uint8_t half[SIZE*SIZE];\
2057
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2058
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2059
}\
2060
\
2061
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2062
    uint8_t full[SIZE*(SIZE+5)];\
2063
    uint8_t * const full_mid= full + SIZE*2;\
2064
    uint8_t half[SIZE*SIZE];\
2065
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2066
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2067
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2068
}\
2069
\
2070
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2071
    uint8_t full[SIZE*(SIZE+5)];\
2072
    uint8_t * const full_mid= full + SIZE*2;\
2073
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2074
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2075
}\
2076
\
2077
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2078
    uint8_t full[SIZE*(SIZE+5)];\
2079
    uint8_t * const full_mid= full + SIZE*2;\
2080
    uint8_t half[SIZE*SIZE];\
2081
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2082
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2083
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2084
}\
2085
\
2086
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2087
    uint8_t full[SIZE*(SIZE+5)];\
2088
    uint8_t * const full_mid= full + SIZE*2;\
2089
    uint8_t halfH[SIZE*SIZE];\
2090
    uint8_t halfV[SIZE*SIZE];\
2091
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2092
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2093
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2094
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2095
}\
2096
\
2097
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2098
    uint8_t full[SIZE*(SIZE+5)];\
2099
    uint8_t * const full_mid= full + SIZE*2;\
2100
    uint8_t halfH[SIZE*SIZE];\
2101
    uint8_t halfV[SIZE*SIZE];\
2102
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2103
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2104
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2105
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2106
}\
2107
\
2108
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2109
    uint8_t full[SIZE*(SIZE+5)];\
2110
    uint8_t * const full_mid= full + SIZE*2;\
2111
    uint8_t halfH[SIZE*SIZE];\
2112
    uint8_t halfV[SIZE*SIZE];\
2113
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2114
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2115
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2116
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2117
}\
2118
\
2119
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2120
    uint8_t full[SIZE*(SIZE+5)];\
2121
    uint8_t * const full_mid= full + SIZE*2;\
2122
    uint8_t halfH[SIZE*SIZE];\
2123
    uint8_t halfV[SIZE*SIZE];\
2124
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2125
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2126
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2127
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2128
}\
2129
\
2130
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2131
    int16_t tmp[SIZE*(SIZE+5)];\
2132
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2133
}\
2134
\
2135
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2136
    int16_t tmp[SIZE*(SIZE+5)];\
2137
    uint8_t halfH[SIZE*SIZE];\
2138
    uint8_t halfHV[SIZE*SIZE];\
2139
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2140
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2141
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2142
}\
2143
\
2144
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2145
    int16_t tmp[SIZE*(SIZE+5)];\
2146
    uint8_t halfH[SIZE*SIZE];\
2147
    uint8_t halfHV[SIZE*SIZE];\
2148
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2149
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2150
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2151
}\
2152
\
2153
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2154
    uint8_t full[SIZE*(SIZE+5)];\
2155
    uint8_t * const full_mid= full + SIZE*2;\
2156
    int16_t tmp[SIZE*(SIZE+5)];\
2157
    uint8_t halfV[SIZE*SIZE];\
2158
    uint8_t halfHV[SIZE*SIZE];\
2159
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2160
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2161
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2162
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2163
}\
2164
\
2165
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2166
    uint8_t full[SIZE*(SIZE+5)];\
2167
    uint8_t * const full_mid= full + SIZE*2;\
2168
    int16_t tmp[SIZE*(SIZE+5)];\
2169
    uint8_t halfV[SIZE*SIZE];\
2170
    uint8_t halfHV[SIZE*SIZE];\
2171
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2172
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2173
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2174
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2175
}\
2176

    
2177
/* Pixel-store macros plugged into the H.264 qpel lowpass templates above:
 * op_put writes the clipped filter result, op_avg averages it with the
 * existing destination pixel (rounding up). The "+16 >> 5" pair is for the
 * single-pass 6-tap filter (scale 32); the "+512 >> 10" pair is for the
 * two-pass horizontal+vertical case (scale 1024). 'cm' is the crop table
 * in scope at each expansion site. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass filter kernels and the full set of 16 motion
 * compensation functions for both "put" and "avg" flavours at block
 * sizes 4, 8 and 16. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2196
#endif
2197

    
2198
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200
    int i;
2201

    
2202
    for(i=0; i<h; i++){
2203
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211
        dst+=dstStride;
2212
        src+=srcStride;        
2213
    }
2214
}
2215

    
2216
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218
    int i;
2219

    
2220
    for(i=0; i<w; i++){
2221
        const int src_1= src[ -srcStride];
2222
        const int src0 = src[0          ];
2223
        const int src1 = src[  srcStride];
2224
        const int src2 = src[2*srcStride];
2225
        const int src3 = src[3*srcStride];
2226
        const int src4 = src[4*srcStride];
2227
        const int src5 = src[5*srcStride];
2228
        const int src6 = src[6*srcStride];
2229
        const int src7 = src[7*srcStride];
2230
        const int src8 = src[8*srcStride];
2231
        const int src9 = src[9*srcStride];
2232
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2234
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2235
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2236
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2237
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2238
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2239
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2240
        src++;
2241
        dst++;
2242
    }
2243
}
2244

    
2245
/* mspel (0,0): integer-pel position, plain 8x8 copy with no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2248

    
2249
/* mspel (1,0): quarter-pel left of center — average of the unfiltered
 * source and the horizontally lowpassed block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hbuf, stride, stride, 8, 8);
}
2254

    
2255
/* mspel (2,0): horizontal half-pel — lowpass filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2258

    
2259
/* mspel (3,0): quarter-pel right of center — average of the source
 * shifted one pixel right and the horizontally lowpassed block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hbuf, stride, stride, 8, 8);
}
2264

    
2265
/* mspel (0,2): vertical half-pel — lowpass filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2268

    
2269
/* mspel (1,2): average of the vertically filtered block and the
 * horizontally-then-vertically filtered block. The horizontal pass
 * produces 11 rows (one above, two below) so the vertical pass over it
 * has the margin it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];
    uint8_t vbuf[64];
    uint8_t hvbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
2278
/* mspel (3,2): like mc12 but the pure vertical pass starts one pixel
 * to the right of the block origin. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];
    uint8_t vbuf[64];
    uint8_t hvbuf[64];

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
2287
/* mspel (2,2): half-pel in both directions — horizontal lowpass into a
 * temporary (11 rows for vertical margin), then vertical lowpass to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];

    wmv2_mspel8_h_lowpass(hbuf, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hbuf+8, stride, 8, 8);
}
2292

    
2293
/**
 * H.263 deblocking, vertical-edge variant: filters the horizontal block
 * boundary lying between src[-1*stride] and src[0], for columns x=0..7.
 * Touches two pixel rows on each side of the edge. Filter strength is
 * looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* gradient across the edge */

        /* piecewise-linear response: proportional correction for small |d|,
           tapering back to zero once |d| exceeds 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* clip to 0..255: if the value left the byte range (bit 8 set),
           ~(v>>31) yields 0 for overflow-negative and 255 for positive */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* weaker secondary correction of the outer pixel pair,
           bounded by half the inner correction */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2327

    
2328
/**
 * H.263 deblocking, horizontal-edge variant: filters the vertical block
 * boundary lying between src[-1] and src[0], for rows y=0..7. Same math
 * as h263_v_loop_filter_c with the pixel addressing transposed.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* gradient across the edge */

        /* piecewise-linear response tapering to zero beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* clip to 0..255 (see vertical variant for the bit trick) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* weaker secondary correction of the outer pixel pair */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2362

    
2363
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of one 8x8 block,
 * done in place. Border rows/columns are passed through unfiltered
 * (copied with matching scale). Vertical pass goes into temp[] at 4x
 * scale; the horizontal pass rescales back with rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass: top and bottom rows copied (scaled by 4) ... */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* ... interior rows get the [1 2 1] vertical kernel */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }
        
    /* horizontal pass back into src: edge columns just rescaled (+2>>2),
       interior columns filtered and rescaled (+8>>4, total scale 16) */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2389

    
2390
/* Sum of absolute differences over a 16-pixel-wide block of height h.
 * 'v' is an unused context pointer kept for the me_cmp_func signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2417

    
2418
/* SAD against the horizontally half-pel interpolated reference:
 * pix2 is averaged with its right neighbour before comparison
 * (reads pix2[0..16] per row). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2445

    
2446
/* SAD against the vertically half-pel interpolated reference:
 * pix2 is averaged with the row below (pix3) before comparison. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2475

    
2476
/* SAD against the half-pel interpolated reference in both directions:
 * each reference sample is the 4-tap average of a 2x2 neighbourhood
 * (reads pix2/pix3 columns 0..16). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2505

    
2506
/* Sum of absolute differences over an 8-pixel-wide block of height h. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2525

    
2526
/* 8-wide SAD against the horizontally half-pel interpolated reference
 * (reads pix2[0..8] per row). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2545

    
2546
/* 8-wide SAD against the vertically half-pel interpolated reference. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2567

    
2568
/* 8-wide SAD against the 2x2-averaged (half-pel both ways) reference. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2589

    
2590
/* Noise-shaped SSE, 16-wide: plain SSE plus a weighted penalty for the
 * difference in local 2x2 gradients between source and reference.
 * When no context is given the weight defaults to 8. */
static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sse=0;
    int grad_diff=0;
    int weight;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            const int d= s1[x] - s2[x];
            sse += d*d;
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                grad_diff+= ABS(  s1[x  ] - s1[x  +stride]
                                - s1[x+1] + s1[x+1+stride])
                           -ABS(  s2[x  ] - s2[x  +stride]
                                - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    weight= c ? c->avctx->nsse_weight : 8;
    return sse + ABS(grad_diff)*weight;
}
2614

    
2615
/* Noise-shaped SSE, 8-wide variant of nsse16_c. */
static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sse=0;
    int grad_diff=0;
    int weight;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            const int d= s1[x] - s2[x];
            sse += d*d;
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                grad_diff+= ABS(  s1[x  ] - s1[x  +stride]
                                - s1[x+1] + s1[x+1+stride])
                           -ABS(  s2[x  ] - s2[x  +stride]
                                - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    weight= c ? c->avctx->nsse_weight : 8;
    return sse + ABS(grad_diff)*weight;
}
2639

    
2640
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2641
    int i;
2642
    unsigned int sum=0;
2643

    
2644
    for(i=0; i<8*8; i++){
2645
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2646
        int w= weight[i];
2647
        b>>= RECON_SHIFT;
2648
        assert(-512<b && b<512);
2649

    
2650
        sum += (w*b)*(w*b)>>4;
2651
    }
2652
    return sum>>2;
2653
}
2654

    
2655
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2656
    int i;
2657

    
2658
    for(i=0; i<8*8; i++){
2659
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2660
    }    
2661
}
2662

    
2663
/**
2664
 * permutes an 8x8 block.
2665
 * @param block the block which will be permuted according to the given permutation vector
2666
 * @param permutation the permutation vector
2667
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2668
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2669
 *                  (inverse) permutated to scantable order!
2670
 */
2671
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2672
{
2673
    int i;
2674
    DCTELEM temp[64];
2675
    
2676
    if(last<=0) return;
2677
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2678

    
2679
    for(i=0; i<=last; i++){
2680
        const int j= scantable[i];
2681
        temp[j]= block[j];
2682
        block[j]=0;
2683
    }
2684
    
2685
    for(i=0; i<=last; i++){
2686
        const int j= scantable[i];
2687
        const int perm_j= permutation[j];
2688
        block[perm_j]= temp[j];
2689
    }
2690
}
2691

    
2692
/* Comparison function that always reports zero cost (FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2695

    
2696
/**
 * Fills the 5-entry cmp[] table of comparison functions (one entry per
 * block-size level used by motion estimation) from the DSPContext,
 * selected by the low byte of 'type'. Unrecognized types log an error
 * and leave the entry NULL (from the initial memset).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;
    
    memset(cmp, 0, sizeof(void*)*5);
        
    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2741

    
2742
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes the six 8x8 coefficient blocks of one macroblock.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2749

    
2750
/* dst[i] += src[i] for i in [0, w); byte arithmetic wraps mod 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
2765

    
2766
/* dst[i] = src1[i] - src2[i] for i in [0, w); wraps mod 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
2781

    
2782
/* HuffYUV median-prediction residual: for each position, predicts from
 * the left, top (src1) and top-left samples via mid_pred and stores the
 * difference of src2 against that prediction. The running left/top-left
 * state is passed in and returned through 'left' and 'left_top'. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev_left, prev_topleft;
    int i;

    prev_left= *left;
    prev_topleft= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(prev_left, src1[i], (prev_left + src1[i] - prev_topleft)&0xFF);
        prev_topleft= src1[i];
        prev_left= src2[i];
        dst[i]= prev_left - pred;
    }

    *left= prev_left;
    *left_top= prev_topleft;
}
2799

    
2800
/* 2-point butterfly writing sum/difference to separate outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: (x, y) <- (x+y, x-y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded directly into absolute-value accumulation. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2814

    
2815
/**
 * SATD of an 8x8 block: takes the difference src - dst, applies an 8x8
 * Hadamard transform (row butterflies followed by column butterflies,
 * with the final column stage folded into BUTTERFLYA), and returns the
 * sum of absolute transform coefficients. Used for FF_CMP_SATD.
 *
 * Fix: removed the '#if 0' dead debug block (a disabled max-tracker with
 * a function-scope static and printf) that served no purpose.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);

    /* horizontal (row) transforms on the pixel differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (column) transforms; last stage folded into the
       absolute-value accumulation by BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2866

    
2867
/**
 * Intra SATD of one 8x8 block: same Hadamard butterfly network as
 * hadamard8_diff8x8_c, but applied to the pixels themselves (no
 * reference block; 'dummy' is unused), and with the DC term's
 * contribution subtracted at the end.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);
    
    /* horizontal (row) transforms */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (column) transforms with the last stage folded into the
       absolute-value accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean
    
    return sum;
}
2914

    
2915
/* DCT-domain SAD (FF_CMP_DCT): forward DCT of the pixel difference,
 * then the sum of absolute transform coefficients. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];   /* uint64_t backing forces 8-byte alignment for the DCT */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;
    
    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2931

    
2932
void simple_idct(DCTELEM *block); //FIXME
2933

    
2934
/* Quantization-noise metric (FF_CMP_PSNR): runs the difference block
 * through quantize -> dequantize -> idct and returns the squared error
 * against the saved unquantized coefficients. NOTE(review): compares in
 * the transform domain via simple_idct on dequantized data — the FIXMEs
 * below are from the original author. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];   /* two 64-coeff blocks, 8-byte aligned */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;   /* force the inter quantization path */
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    /* keep an unquantized copy for the error computation below */
    memcpy(bak, temp, 64*sizeof(DCTELEM));
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2957

    
2958
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2959
    MpegEncContext * const s= (MpegEncContext *)c;
2960
    const uint8_t *scantable= s->intra_scantable.permutated;
2961
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2962
    uint64_t __align8 aligned_bak[stride];
2963
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
2964
    uint8_t * const bak= (uint8_t*)aligned_bak;
2965
    int i, last, run, bits, level, distoration, start_i;
2966
    const int esc_length= s->ac_esc_length;
2967
    uint8_t * length;
2968
    uint8_t * last_length;
2969
    
2970
    assert(h==8);
2971

    
2972
    for(i=0; i<8; i++){
2973
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2974
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2975
    }
2976

    
2977
    s->dsp.diff_pixels(temp, src1, src2, stride);
2978

    
2979
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2980

    
2981
    bits=0;
2982
    
2983
    if (s->mb_intra) {
2984
        start_i = 1; 
2985
        length     = s->intra_ac_vlc_length;
2986
        last_length= s->intra_ac_vlc_last_length;
2987
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2988
    } else {
2989
        start_i = 0;
2990
        length     = s->inter_ac_vlc_length;
2991
        last_length= s->inter_ac_vlc_last_length;
2992
    }
2993
    
2994
    if(last>=start_i){
2995
        run=0;
2996
        for(i=start_i; i<last; i++){
2997
            int j= scantable[i];
2998
            level= temp[j];
2999
        
3000
            if(level){
3001
                level+=64;
3002
                if((level&(~127)) == 0){
3003
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3004
                }else
3005
                    bits+= esc_length;
3006
                run=0;
3007
            }else
3008
                run++;
3009
        }
3010
        i= scantable[last];
3011
       
3012
        level= temp[i] + 64;
3013

    
3014
        assert(level - 64);
3015
        
3016
        if((level&(~127)) == 0){
3017
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3018
        }else
3019
            bits+= esc_length;
3020
    
3021
    }
3022

    
3023
    if(last>=0){
3024
        if(s->mb_intra)
3025
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3026
        else
3027
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3028
    }
3029
    
3030
    s->dsp.idct_add(bak, stride, temp);
3031
    
3032
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3033

    
3034
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3035
}
3036

    
3037
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3038
    MpegEncContext * const s= (MpegEncContext *)c;
3039
    const uint8_t *scantable= s->intra_scantable.permutated;
3040
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3041
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3042
    int i, last, run, bits, level, start_i;
3043
    const int esc_length= s->ac_esc_length;
3044
    uint8_t * length;
3045
    uint8_t * last_length;
3046

    
3047
    assert(h==8);
3048
    
3049
    s->dsp.diff_pixels(temp, src1, src2, stride);
3050

    
3051
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3052

    
3053
    bits=0;
3054
    
3055
    if (s->mb_intra) {
3056
        start_i = 1; 
3057
        length     = s->intra_ac_vlc_length;
3058
        last_length= s->intra_ac_vlc_last_length;
3059
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3060
    } else {
3061
        start_i = 0;
3062
        length     = s->inter_ac_vlc_length;
3063
        last_length= s->inter_ac_vlc_last_length;
3064
    }
3065
    
3066
    if(last>=start_i){
3067
        run=0;
3068
        for(i=start_i; i<last; i++){
3069
            int j= scantable[i];
3070
            level= temp[j];
3071
        
3072
            if(level){
3073
                level+=64;
3074
                if((level&(~127)) == 0){
3075
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3076
                }else
3077
                    bits+= esc_length;
3078
                run=0;
3079
            }else
3080
                run++;
3081
        }
3082
        i= scantable[last];
3083
                
3084
        level= temp[i] + 64;
3085
        
3086
        assert(level - 64);
3087
        
3088
        if((level&(~127)) == 0){
3089
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3090
        }else
3091
            bits+= esc_length;
3092
    }
3093

    
3094
    return bits;
3095
}
3096

    
3097
/**
 * Vertical SAD within a single 16-wide block: sum of absolute differences
 * between vertically adjacent pixels (a smoothness measure for intra blocks).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            score += ABS(s[col] - s[col + stride]);
        s += stride;
    }

    return score;
}
3111

    
3112
/**
 * Vertical SAD between two 16-wide blocks: sum of absolute differences of
 * the vertical gradients of s1 and s2.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            score += ABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3126

    
3127
#define SQ(a) ((a)*(a))

/**
 * Vertical SSE within a single 16-wide block: sum of squared differences
 * between vertically adjacent pixels (a smoothness measure for intra blocks).
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s[col] - s[col + stride];
            score += SQ(d);
        }
        s += stride;
    }

    return score;
}
3142

    
3143
/**
 * Vertical SSE between two 16-wide blocks: sum of squared differences of
 * the vertical gradients of s1 and s2.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            score += SQ(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3157

    
3158
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3159
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3160
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3161
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3162
WARPER8_16_SQ(rd8x8_c, rd16_c)
3163
WARPER8_16_SQ(bit8x8_c, bit16_c)
3164

    
3165
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3166
 converted */
3167
/* Glue for the reference (jpeg) IDCT: inverse transform, then store the
 * clamped result into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
3172
/* Glue for the reference (jpeg) IDCT: inverse transform, then add the
 * clamped result onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
3177

    
3178
/* init static data */
3179
void dsputil_static_init(void)
3180
{
3181
    int i;
3182

    
3183
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3184
    for(i=0;i<MAX_NEG_CROP;i++) {
3185
        cropTbl[i] = 0;
3186
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3187
    }
3188
    
3189
    for(i=0;i<512;i++) {
3190
        squareTbl[i] = (i - 256) * (i - 256);
3191
    }
3192
    
3193
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3194
}
3195

    
3196

    
3197
/**
 * Fill @p c with the portable C implementations of all DSP functions,
 * honoring the DCT/IDCT algorithm choices in @p avctx, then let the
 * platform-specific initializers override individual entries, and finally
 * build the IDCT coefficient permutation table.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if (avctx->dct_algo == FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    } else if (avctx->dct_algo == FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    } else {
        c->fdct    = ff_jpeg_fdct_islow; /* slow/accurate/default */
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection */
    if (avctx->idct_algo == FF_IDCT_INT) {
        c->idct_put = ff_jref_idct_put;
        c->idct_add = ff_jref_idct_add;
        c->idct     = j_rev_dct;
        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
    } else { /* accurate/default */
        c->idct_put = simple_idct_put;
        c->idct_add = simple_idct_add;
        c->idct     = simple_idct;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct     = vp3_idct_c;

    /* pixel block helpers */
    c->get_pixels                = get_pixels_c;
    c->diff_pixels               = diff_pixels_c;
    c->put_pixels_clamped        = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped        = add_pixels_clamped_c;
    c->gmc1         = gmc1_c;
    c->gmc          = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum      = pix_sum_c;
    c->pix_norm1    = pix_norm1_c;

    /* SAD with half-pel interpolation: [0] 16-wide, [1] 8-wide */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg motion compensation */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0] = put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1] = put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel motion compensation (16 sub-pel positions) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);
#undef dspfunc

    /* H.264 chroma motion compensation */
    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_c;

    /* WMV2 mspel motion compensation */
    c->put_mspel_pixels_tab[0] = put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;

    /* comparison functions: [0] = 16x16 variant, [1] = 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4] = hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0] = pix_abs16_c;
    c->sad[1] = pix_abs8_c;
    c->sse[0] = sse16_c;
    c->sse[1] = sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0] = vsad16_c;
    c->vsad[4] = vsad_intra16_c;
    c->vsse[0] = vsse16_c;
    c->vsse[4] = vsse_intra16_c;
    c->nsse[0] = nsse16_c;
    c->nsse[1] = nsse8_c;

    c->add_bytes  = add_bytes_c;
    c->diff_bytes = diff_bytes_c;
    c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
    c->bswap_buf  = bswap_buf;

    c->h263_h_loop_filter = h263_h_loop_filter_c;
    c->h263_v_loop_filter = h263_v_loop_filter_c;

    c->h261_loop_filter = h261_loop_filter_c;

    c->try_8x8basis = try_8x8basis_c;
    c->add_8x8basis = add_8x8basis_c;

    /* let the platform-specific code override the C defaults */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c, avctx);
#endif

    /* build the coefficient permutation matching the selected IDCT */
    switch (c->idct_permutation_type) {
    case FF_NO_IDCT_PERM:
        for (i = 0; i < 64; i++)
            c->idct_permutation[i] = i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for (i = 0; i < 64; i++)
            c->idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            c->idct_permutation[i] = simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            c->idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3431