Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 8f2ab833

History | View | Annotate | Download (120 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22
 
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27
 
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* Clamp lookup table: indexed through cropTbl + MAX_NEG_CROP so that
   out-of-range (including negative) values map into 0..255 — see
   put_pixels_clamped_c/add_pixels_clamped_c.  No initializer here, so it
   is presumably filled by the init code — not visible in this chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table: indexed through squareTbl + 256 with signed
   offsets in -255..255 (sse8_c/sse16_c) or 0..255 (pix_norm1_c).
   Presumably filled at init time — not visible in this chunk. */
uint32_t squareTbl[512];
36

    
37
/* Standard zigzag scan pattern for 8x8 coefficient blocks. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47

    
48
/* Specific zigzag scan for 248 idct. NOTE that unlike the
49
   specification, we interleave the fields */
50
/* Zigzag scan for the 2-4-8 (248) IDCT, with the coefficients of the
   two fields interleaved (cf. the comment above). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60

    
61
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Non-const and uninitialized here: filled at runtime (the
   initialization code is not visible in this chunk). */
uint16_t __align8 inv_zigzag_direct16[64];
63

    
64
/* Alternate (horizontal) scan pattern for 8x8 coefficient blocks. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74

    
75
/* Alternate (vertical) scan pattern for 8x8 coefficient blocks. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
87
const uint32_t inverse[256]={
88
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757, 
89
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154, 
90
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709, 
91
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333, 
92
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367, 
93
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283, 
94
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315, 
95
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085, 
96
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498, 
97
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675, 
98
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441, 
99
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183, 
100
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712, 
101
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400, 
102
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163, 
103
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641, 
104
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573, 
105
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737, 
106
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493, 
107
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373, 
108
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368, 
109
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671, 
110
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767, 
111
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740, 
112
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751, 
113
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635, 
114
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593, 
115
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944, 
116
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933, 
117
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575, 
118
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532, 
119
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
120
};
121

    
122
/* Input permutation for the simple_idct_mmx */
123
/* Each entry is a 6-bit coefficient index (0x00..0x3F) giving the input
   order expected by simple_idct_mmx — see that implementation. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133

    
134
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Return the sum of all 256 pixels of a 16x16 block.
     * pix:       top-left of the block
     * line_size: byte distance between the starts of consecutive rows
     */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;           /* advance to the next row */
    }
    return total;
}
155

    
156
static int pix_norm1_c(uint8_t * pix, int line_size)
157
{
158
    int s, i, j;
159
    uint32_t *sq = squareTbl + 256;
160

    
161
    s = 0;
162
    for (i = 0; i < 16; i++) {
163
        for (j = 0; j < 16; j += 8) {
164
#if 0
165
            s += sq[pix[0]];
166
            s += sq[pix[1]];
167
            s += sq[pix[2]];
168
            s += sq[pix[3]];
169
            s += sq[pix[4]];
170
            s += sq[pix[5]];
171
            s += sq[pix[6]];
172
            s += sq[pix[7]];
173
#else
174
#if LONG_MAX > 2147483647
175
            register uint64_t x=*(uint64_t*)pix;
176
            s += sq[x&0xff];
177
            s += sq[(x>>8)&0xff];
178
            s += sq[(x>>16)&0xff];
179
            s += sq[(x>>24)&0xff];
180
            s += sq[(x>>32)&0xff];
181
            s += sq[(x>>40)&0xff];
182
            s += sq[(x>>48)&0xff];
183
            s += sq[(x>>56)&0xff];
184
#else
185
            register uint32_t x=*(uint32_t*)pix;
186
            s += sq[x&0xff];
187
            s += sq[(x>>8)&0xff];
188
            s += sq[(x>>16)&0xff];
189
            s += sq[(x>>24)&0xff];
190
            x=*(uint32_t*)(pix+4);
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
#endif
196
#endif
197
            pix += 8;
198
        }
199
        pix += line_size - 16;
200
    }
201
    return s;
202
}
203

    
204
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst (dst may equal src). */
    int i;

    for(i=0; i<w; i++)
        dst[i] = bswap_32(src[i]);
}
221

    
222
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223
{
224
    int s, i;
225
    uint32_t *sq = squareTbl + 256;
226

    
227
    s = 0;
228
    for (i = 0; i < h; i++) {
229
        s += sq[pix1[0] - pix2[0]];
230
        s += sq[pix1[1] - pix2[1]];
231
        s += sq[pix1[2] - pix2[2]];
232
        s += sq[pix1[3] - pix2[3]];
233
        s += sq[pix1[4] - pix2[4]];
234
        s += sq[pix1[5] - pix2[5]];
235
        s += sq[pix1[6] - pix2[6]];
236
        s += sq[pix1[7] - pix2[7]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242

    
243
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244
{
245
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247

    
248
    s = 0;
249
    for (i = 0; i < h; i++) {
250
        s += sq[pix1[ 0] - pix2[ 0]];
251
        s += sq[pix1[ 1] - pix2[ 1]];
252
        s += sq[pix1[ 2] - pix2[ 2]];
253
        s += sq[pix1[ 3] - pix2[ 3]];
254
        s += sq[pix1[ 4] - pix2[ 4]];
255
        s += sq[pix1[ 5] - pix2[ 5]];
256
        s += sq[pix1[ 6] - pix2[ 6]];
257
        s += sq[pix1[ 7] - pix2[ 7]];
258
        s += sq[pix1[ 8] - pix2[ 8]];
259
        s += sq[pix1[ 9] - pix2[ 9]];
260
        s += sq[pix1[10] - pix2[10]];
261
        s += sq[pix1[11] - pix2[11]];
262
        s += sq[pix1[12] - pix2[12]];
263
        s += sq[pix1[13] - pix2[13]];
264
        s += sq[pix1[14] - pix2[14]];
265
        s += sq[pix1[15] - pix2[15]];
266

    
267
        pix1 += line_size;
268
        pix2 += line_size;
269
    }
270
    return s;
271
}
272

    
273
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    /* Read an 8x8 block of 8-bit pixels into the 8x8 DCTELEM buffer. */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
291

    
292
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* Store the element-wise difference s1 - s2 of two 8x8 pixel
       blocks into the DCTELEM buffer. */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
311

    
312

    
313
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314
                                 int line_size)
315
{
316
    int i;
317
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
    
319
    /* read the pixels */
320
    for(i=0;i<8;i++) {
321
        pixels[0] = cm[block[0]];
322
        pixels[1] = cm[block[1]];
323
        pixels[2] = cm[block[2]];
324
        pixels[3] = cm[block[3]];
325
        pixels[4] = cm[block[4]];
326
        pixels[5] = cm[block[5]];
327
        pixels[6] = cm[block[6]];
328
        pixels[7] = cm[block[7]];
329

    
330
        pixels += line_size;
331
        block += 8;
332
    }
333
}
334

    
335
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336
                          int line_size)
337
{
338
    int i;
339
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
340
    
341
    /* read the pixels */
342
    for(i=0;i<8;i++) {
343
        pixels[0] = cm[pixels[0] + block[0]];
344
        pixels[1] = cm[pixels[1] + block[1]];
345
        pixels[2] = cm[pixels[2] + block[2]];
346
        pixels[3] = cm[pixels[3] + block[3]];
347
        pixels[4] = cm[pixels[4] + block[4]];
348
        pixels[5] = cm[pixels[5] + block[5]];
349
        pixels[6] = cm[pixels[6] + block[6]];
350
        pixels[7] = cm[pixels[7] + block[7]];
351
        pixels += line_size;
352
        block += 8;
353
    }
354
}
355
#if 0
356

357
#define PIXOP2(OPNAME, OP) \
358
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
359
{\
360
    int i;\
361
    for(i=0; i<h; i++){\
362
        OP(*((uint64_t*)block), LD64(pixels));\
363
        pixels+=line_size;\
364
        block +=line_size;\
365
    }\
366
}\
367
\
368
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
369
{\
370
    int i;\
371
    for(i=0; i<h; i++){\
372
        const uint64_t a= LD64(pixels  );\
373
        const uint64_t b= LD64(pixels+1);\
374
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
375
        pixels+=line_size;\
376
        block +=line_size;\
377
    }\
378
}\
379
\
380
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
381
{\
382
    int i;\
383
    for(i=0; i<h; i++){\
384
        const uint64_t a= LD64(pixels  );\
385
        const uint64_t b= LD64(pixels+1);\
386
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
387
        pixels+=line_size;\
388
        block +=line_size;\
389
    }\
390
}\
391
\
392
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
393
{\
394
    int i;\
395
    for(i=0; i<h; i++){\
396
        const uint64_t a= LD64(pixels          );\
397
        const uint64_t b= LD64(pixels+line_size);\
398
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
399
        pixels+=line_size;\
400
        block +=line_size;\
401
    }\
402
}\
403
\
404
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
405
{\
406
    int i;\
407
    for(i=0; i<h; i++){\
408
        const uint64_t a= LD64(pixels          );\
409
        const uint64_t b= LD64(pixels+line_size);\
410
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
411
        pixels+=line_size;\
412
        block +=line_size;\
413
    }\
414
}\
415
\
416
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
417
{\
418
        int i;\
419
        const uint64_t a= LD64(pixels  );\
420
        const uint64_t b= LD64(pixels+1);\
421
        uint64_t l0=  (a&0x0303030303030303ULL)\
422
                    + (b&0x0303030303030303ULL)\
423
                    + 0x0202020202020202ULL;\
424
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
425
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
426
        uint64_t l1,h1;\
427
\
428
        pixels+=line_size;\
429
        for(i=0; i<h; i+=2){\
430
            uint64_t a= LD64(pixels  );\
431
            uint64_t b= LD64(pixels+1);\
432
            l1=  (a&0x0303030303030303ULL)\
433
               + (b&0x0303030303030303ULL);\
434
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
435
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
436
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
437
            pixels+=line_size;\
438
            block +=line_size;\
439
            a= LD64(pixels  );\
440
            b= LD64(pixels+1);\
441
            l0=  (a&0x0303030303030303ULL)\
442
               + (b&0x0303030303030303ULL)\
443
               + 0x0202020202020202ULL;\
444
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
445
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
446
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
447
            pixels+=line_size;\
448
            block +=line_size;\
449
        }\
450
}\
451
\
452
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
453
{\
454
        int i;\
455
        const uint64_t a= LD64(pixels  );\
456
        const uint64_t b= LD64(pixels+1);\
457
        uint64_t l0=  (a&0x0303030303030303ULL)\
458
                    + (b&0x0303030303030303ULL)\
459
                    + 0x0101010101010101ULL;\
460
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
461
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
462
        uint64_t l1,h1;\
463
\
464
        pixels+=line_size;\
465
        for(i=0; i<h; i+=2){\
466
            uint64_t a= LD64(pixels  );\
467
            uint64_t b= LD64(pixels+1);\
468
            l1=  (a&0x0303030303030303ULL)\
469
               + (b&0x0303030303030303ULL);\
470
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
471
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
472
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
473
            pixels+=line_size;\
474
            block +=line_size;\
475
            a= LD64(pixels  );\
476
            b= LD64(pixels+1);\
477
            l0=  (a&0x0303030303030303ULL)\
478
               + (b&0x0303030303030303ULL)\
479
               + 0x0101010101010101ULL;\
480
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
481
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
482
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
483
            pixels+=line_size;\
484
            block +=line_size;\
485
        }\
486
}\
487
\
488
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
489
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
490
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
491
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
492
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
493
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
494
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
495

496
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
497
#else // 64 bit variant
498

    
499
#define PIXOP2(OPNAME, OP) \
500
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
501
    int i;\
502
    for(i=0; i<h; i++){\
503
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
504
        pixels+=line_size;\
505
        block +=line_size;\
506
    }\
507
}\
508
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
509
    int i;\
510
    for(i=0; i<h; i++){\
511
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
512
        pixels+=line_size;\
513
        block +=line_size;\
514
    }\
515
}\
516
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
517
    int i;\
518
    for(i=0; i<h; i++){\
519
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
520
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
521
        pixels+=line_size;\
522
        block +=line_size;\
523
    }\
524
}\
525
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
526
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
527
}\
528
\
529
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530
                                                int src_stride1, int src_stride2, int h){\
531
    int i;\
532
    for(i=0; i<h; i++){\
533
        uint32_t a,b;\
534
        a= LD32(&src1[i*src_stride1  ]);\
535
        b= LD32(&src2[i*src_stride2  ]);\
536
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
537
        a= LD32(&src1[i*src_stride1+4]);\
538
        b= LD32(&src2[i*src_stride2+4]);\
539
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
540
    }\
541
}\
542
\
543
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544
                                                int src_stride1, int src_stride2, int h){\
545
    int i;\
546
    for(i=0; i<h; i++){\
547
        uint32_t a,b;\
548
        a= LD32(&src1[i*src_stride1  ]);\
549
        b= LD32(&src2[i*src_stride2  ]);\
550
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
551
        a= LD32(&src1[i*src_stride1+4]);\
552
        b= LD32(&src2[i*src_stride2+4]);\
553
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
554
    }\
555
}\
556
\
557
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
558
                                                int src_stride1, int src_stride2, int h){\
559
    int i;\
560
    for(i=0; i<h; i++){\
561
        uint32_t a,b;\
562
        a= LD32(&src1[i*src_stride1  ]);\
563
        b= LD32(&src2[i*src_stride2  ]);\
564
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
565
    }\
566
}\
567
\
568
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
569
                                                int src_stride1, int src_stride2, int h){\
570
    int i;\
571
    for(i=0; i<h; i++){\
572
        uint32_t a,b;\
573
        a= LD16(&src1[i*src_stride1  ]);\
574
        b= LD16(&src2[i*src_stride2  ]);\
575
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
576
    }\
577
}\
578
\
579
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
580
                                                int src_stride1, int src_stride2, int h){\
581
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
582
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
583
}\
584
\
585
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
586
                                                int src_stride1, int src_stride2, int h){\
587
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
588
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
589
}\
590
\
591
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
592
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
593
}\
594
\
595
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
596
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
597
}\
598
\
599
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
600
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
601
}\
602
\
603
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
604
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
605
}\
606
\
607
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
608
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
609
    int i;\
610
    for(i=0; i<h; i++){\
611
        uint32_t a, b, c, d, l0, l1, h0, h1;\
612
        a= LD32(&src1[i*src_stride1]);\
613
        b= LD32(&src2[i*src_stride2]);\
614
        c= LD32(&src3[i*src_stride3]);\
615
        d= LD32(&src4[i*src_stride4]);\
616
        l0=  (a&0x03030303UL)\
617
           + (b&0x03030303UL)\
618
           + 0x02020202UL;\
619
        h0= ((a&0xFCFCFCFCUL)>>2)\
620
          + ((b&0xFCFCFCFCUL)>>2);\
621
        l1=  (c&0x03030303UL)\
622
           + (d&0x03030303UL);\
623
        h1= ((c&0xFCFCFCFCUL)>>2)\
624
          + ((d&0xFCFCFCFCUL)>>2);\
625
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
626
        a= LD32(&src1[i*src_stride1+4]);\
627
        b= LD32(&src2[i*src_stride2+4]);\
628
        c= LD32(&src3[i*src_stride3+4]);\
629
        d= LD32(&src4[i*src_stride4+4]);\
630
        l0=  (a&0x03030303UL)\
631
           + (b&0x03030303UL)\
632
           + 0x02020202UL;\
633
        h0= ((a&0xFCFCFCFCUL)>>2)\
634
          + ((b&0xFCFCFCFCUL)>>2);\
635
        l1=  (c&0x03030303UL)\
636
           + (d&0x03030303UL);\
637
        h1= ((c&0xFCFCFCFCUL)>>2)\
638
          + ((d&0xFCFCFCFCUL)>>2);\
639
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
640
    }\
641
}\
642
\
643
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
644
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
645
}\
646
\
647
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
648
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
649
}\
650
\
651
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
652
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
653
}\
654
\
655
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
656
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
657
}\
658
\
659
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
660
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
661
    int i;\
662
    for(i=0; i<h; i++){\
663
        uint32_t a, b, c, d, l0, l1, h0, h1;\
664
        a= LD32(&src1[i*src_stride1]);\
665
        b= LD32(&src2[i*src_stride2]);\
666
        c= LD32(&src3[i*src_stride3]);\
667
        d= LD32(&src4[i*src_stride4]);\
668
        l0=  (a&0x03030303UL)\
669
           + (b&0x03030303UL)\
670
           + 0x01010101UL;\
671
        h0= ((a&0xFCFCFCFCUL)>>2)\
672
          + ((b&0xFCFCFCFCUL)>>2);\
673
        l1=  (c&0x03030303UL)\
674
           + (d&0x03030303UL);\
675
        h1= ((c&0xFCFCFCFCUL)>>2)\
676
          + ((d&0xFCFCFCFCUL)>>2);\
677
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
678
        a= LD32(&src1[i*src_stride1+4]);\
679
        b= LD32(&src2[i*src_stride2+4]);\
680
        c= LD32(&src3[i*src_stride3+4]);\
681
        d= LD32(&src4[i*src_stride4+4]);\
682
        l0=  (a&0x03030303UL)\
683
           + (b&0x03030303UL)\
684
           + 0x01010101UL;\
685
        h0= ((a&0xFCFCFCFCUL)>>2)\
686
          + ((b&0xFCFCFCFCUL)>>2);\
687
        l1=  (c&0x03030303UL)\
688
           + (d&0x03030303UL);\
689
        h1= ((c&0xFCFCFCFCUL)>>2)\
690
          + ((d&0xFCFCFCFCUL)>>2);\
691
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
692
    }\
693
}\
694
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
695
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
696
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
698
}\
699
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
700
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
701
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
703
}\
704
\
705
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
706
{\
707
        int i, a0, b0, a1, b1;\
708
        a0= pixels[0];\
709
        b0= pixels[1] + 2;\
710
        a0 += b0;\
711
        b0 += pixels[2];\
712
\
713
        pixels+=line_size;\
714
        for(i=0; i<h; i+=2){\
715
            a1= pixels[0];\
716
            b1= pixels[1];\
717
            a1 += b1;\
718
            b1 += pixels[2];\
719
\
720
            block[0]= (a1+a0)>>2; /* FIXME non put */\
721
            block[1]= (b1+b0)>>2;\
722
\
723
            pixels+=line_size;\
724
            block +=line_size;\
725
\
726
            a0= pixels[0];\
727
            b0= pixels[1] + 2;\
728
            a0 += b0;\
729
            b0 += pixels[2];\
730
\
731
            block[0]= (a1+a0)>>2;\
732
            block[1]= (b1+b0)>>2;\
733
            pixels+=line_size;\
734
            block +=line_size;\
735
        }\
736
}\
737
\
738
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
739
{\
740
        int i;\
741
        const uint32_t a= LD32(pixels  );\
742
        const uint32_t b= LD32(pixels+1);\
743
        uint32_t l0=  (a&0x03030303UL)\
744
                    + (b&0x03030303UL)\
745
                    + 0x02020202UL;\
746
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
747
                   + ((b&0xFCFCFCFCUL)>>2);\
748
        uint32_t l1,h1;\
749
\
750
        pixels+=line_size;\
751
        for(i=0; i<h; i+=2){\
752
            uint32_t a= LD32(pixels  );\
753
            uint32_t b= LD32(pixels+1);\
754
            l1=  (a&0x03030303UL)\
755
               + (b&0x03030303UL);\
756
            h1= ((a&0xFCFCFCFCUL)>>2)\
757
              + ((b&0xFCFCFCFCUL)>>2);\
758
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
759
            pixels+=line_size;\
760
            block +=line_size;\
761
            a= LD32(pixels  );\
762
            b= LD32(pixels+1);\
763
            l0=  (a&0x03030303UL)\
764
               + (b&0x03030303UL)\
765
               + 0x02020202UL;\
766
            h0= ((a&0xFCFCFCFCUL)>>2)\
767
              + ((b&0xFCFCFCFCUL)>>2);\
768
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
769
            pixels+=line_size;\
770
            block +=line_size;\
771
        }\
772
}\
773
\
774
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
775
{\
776
    int j;\
777
    for(j=0; j<2; j++){\
778
        int i;\
779
        const uint32_t a= LD32(pixels  );\
780
        const uint32_t b= LD32(pixels+1);\
781
        uint32_t l0=  (a&0x03030303UL)\
782
                    + (b&0x03030303UL)\
783
                    + 0x02020202UL;\
784
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
785
                   + ((b&0xFCFCFCFCUL)>>2);\
786
        uint32_t l1,h1;\
787
\
788
        pixels+=line_size;\
789
        for(i=0; i<h; i+=2){\
790
            uint32_t a= LD32(pixels  );\
791
            uint32_t b= LD32(pixels+1);\
792
            l1=  (a&0x03030303UL)\
793
               + (b&0x03030303UL);\
794
            h1= ((a&0xFCFCFCFCUL)>>2)\
795
              + ((b&0xFCFCFCFCUL)>>2);\
796
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
797
            pixels+=line_size;\
798
            block +=line_size;\
799
            a= LD32(pixels  );\
800
            b= LD32(pixels+1);\
801
            l0=  (a&0x03030303UL)\
802
               + (b&0x03030303UL)\
803
               + 0x02020202UL;\
804
            h0= ((a&0xFCFCFCFCUL)>>2)\
805
              + ((b&0xFCFCFCFCUL)>>2);\
806
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
807
            pixels+=line_size;\
808
            block +=line_size;\
809
        }\
810
        pixels+=4-line_size*(h+1);\
811
        block +=4-line_size*h;\
812
    }\
813
}\
814
\
815
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
816
{\
817
    int j;\
818
    for(j=0; j<2; j++){\
819
        int i;\
820
        const uint32_t a= LD32(pixels  );\
821
        const uint32_t b= LD32(pixels+1);\
822
        uint32_t l0=  (a&0x03030303UL)\
823
                    + (b&0x03030303UL)\
824
                    + 0x01010101UL;\
825
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
826
                   + ((b&0xFCFCFCFCUL)>>2);\
827
        uint32_t l1,h1;\
828
\
829
        pixels+=line_size;\
830
        for(i=0; i<h; i+=2){\
831
            uint32_t a= LD32(pixels  );\
832
            uint32_t b= LD32(pixels+1);\
833
            l1=  (a&0x03030303UL)\
834
               + (b&0x03030303UL);\
835
            h1= ((a&0xFCFCFCFCUL)>>2)\
836
              + ((b&0xFCFCFCFCUL)>>2);\
837
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
838
            pixels+=line_size;\
839
            block +=line_size;\
840
            a= LD32(pixels  );\
841
            b= LD32(pixels+1);\
842
            l0=  (a&0x03030303UL)\
843
               + (b&0x03030303UL)\
844
               + 0x01010101UL;\
845
            h0= ((a&0xFCFCFCFCUL)>>2)\
846
              + ((b&0xFCFCFCFCUL)>>2);\
847
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
848
            pixels+=line_size;\
849
            block +=line_size;\
850
        }\
851
        pixels+=4-line_size*(h+1);\
852
        block +=4-line_size*h;\
853
    }\
854
}\
855
\
856
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
859
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
863
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
864

    
865
#define op_avg(a, b) a = rnd_avg32(a, b)
866
#endif
867
#define op_put(a, b) a = b
868

    
869
PIXOP2(avg, op_avg)
870
PIXOP2(put, op_put)
871
#undef op_avg
872
#undef op_put
873

    
874
#define avg2(a,b) ((a+b+1)>>1)
875
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
876

    
877

    
878
/**
 * 1/16-pel global motion compensation for one 8-pixel-wide block.
 * Bilinear filter with weights derived from the 1/16-pel phases
 * (x16, y16); rounder is added before the final >>8.
 * Reads one extra row and column of src beyond the 8xh block.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int row;

    for(row=0; row<h; row++){
        int x;
        for(x=0; x<8; x++){
            dst[x]= (A*src[x] + B*src[x+1] + C*src[stride+x] + D*src[stride+x+1] + rounder)>>8;
        }
        dst+= stride;
        src+= stride;
    }
}
900

    
901
/**
 * Global motion compensation with a linear transform (MPEG-4 GMC) for one
 * 8-pixel-wide, h-line-high block.
 *
 * (ox,oy) is the transformed source position of the first pixel, in fixed
 * point such that v>>16 gives the position in units of 1/(1<<shift) pixel;
 * (dxx,dyx) are added per pixel along a row, (dxy,dyy) per line.  r is the
 * rounding constant applied before the final >>(shift*2).  width/height
 * bound the valid source area; samples outside are clamped to the nearest
 * edge pixel via clip().
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* number of sub-pel phases per pixel */
    
    width--;    /* from here on, width/height are the last valid coordinates */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;           /* position in 1/s pixel units */
            src_y= vy>>16;
            frac_x= src_x&(s-1);     /* sub-pel phase, 0..s-1 */
            frac_y= src_y&(s-1);
            src_x>>=shift;           /* integer source coordinates */
            src_y>>=shift;
  
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y outside: clamp the row, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x outside: clamp the column, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both outside: nearest edge pixel, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
958

    
959
/** Thirdpel MC, integer position: plain block copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
967

    
968
/** Thirdpel MC at (1/3, 0): rounded mix of pixel and right neighbour; 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
978

    
979
/** Thirdpel MC at (2/3, 0): rounded mix weighted toward the right neighbour. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
989
    
990
/** Thirdpel MC at (0, 1/3): rounded mix of pixel and the one below it. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1000
    
1001
/** Thirdpel MC at (1/3, 1/3): 2-D weighted average of the 2x2 neighbourhood; 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1011

    
1012
/** Thirdpel MC at (1/3, 2/3): 2-D weighted average of the 2x2 neighbourhood. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1022

    
1023
/** Thirdpel MC at (0, 2/3): rounded mix weighted toward the pixel below. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1033

    
1034
/** Thirdpel MC at (2/3, 1/3): 2-D weighted average of the 2x2 neighbourhood. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1044

    
1045
/** Thirdpel MC at (2/3, 2/3): 2-D weighted average of the 2x2 neighbourhood. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1055

    
1056
/** Thirdpel MC, integer position, averaging into dst; dispatched on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1064

    
1065
/** Thirdpel MC at (1/3, 0), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1075

    
1076
/** Thirdpel MC at (2/3, 0), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1086
    
1087
/** Thirdpel MC at (0, 1/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1097
    
1098
/** Thirdpel MC at (1/3, 1/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1108

    
1109
/** Thirdpel MC at (1/3, 2/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1119

    
1120
/** Thirdpel MC at (0, 2/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1130

    
1131
/** Thirdpel MC at (2/3, 1/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1141

    
1142
/** Thirdpel MC at (2/3, 2/3), rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int rows = height;
    while (rows-- > 0) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1152
#if 0
1153
#define TPEL_WIDTH(width)\
1154
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1155
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1156
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1157
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1158
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1159
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1160
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1161
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1162
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1163
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1164
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1165
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1166
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1167
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1168
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1169
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1170
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1171
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1172
#endif
1173

    
1174
/**
 * Generates the H.264 chroma motion-compensation functions for block
 * widths 2, 4 and 8.  A/B/C/D are the bilinear weights derived from the
 * 1/8-pel fractional position (x,y), 0 <= x,y < 8 (asserted); the four
 * weights sum to 64.  OP combines the weighted sum with dst and performs
 * the rounding/shift (see the op_put/op_avg definitions at the
 * instantiation site).  src is read one extra row and column beyond the
 * block.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1236

    
1237
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1238
#define op_put(a, b) a = (((b) + 32)>>6)
1239

    
1240
H264_CHROMA_MC(put_       , op_put)
1241
H264_CHROMA_MC(avg_       , op_avg)
1242
#undef op_avg
1243
#undef op_put
1244

    
1245
/** Copy a 4-wide, h-high block one 32-bit word per row (LD32/ST32 handle alignment). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1255

    
1256
/** Copy an 8-wide, h-high block as two 32-bit words per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst  , LD32(src  ));
        ST32(dst+4, LD32(src+4));
        dst += dstStride;
        src += srcStride;
    }
}
1267

    
1268
/** Copy a 16-wide, h-high block as four 32-bit words per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst   , LD32(src   ));
        ST32(dst+ 4, LD32(src+ 4));
        ST32(dst+ 8, LD32(src+ 8));
        ST32(dst+12, LD32(src+12));
        dst += dstStride;
        src += srcStride;
    }
}
1281

    
1282
/** Copy a 17-wide, h-high block: four words plus one trailing byte per row
 *  (the extra column needed by the 16-tap qpel edge filters). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst   , LD32(src   ));
        ST32(dst+ 4, LD32(src+ 4));
        ST32(dst+ 8, LD32(src+ 8));
        ST32(dst+12, LD32(src+12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1296

    
1297
/** Copy a 9-wide, h-high block: two words plus one trailing byte per row. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst  , LD32(src  ));
        ST32(dst+4, LD32(src+4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1309

    
1310

    
1311
#define QPEL_MC(r, OPNAME, RND, OP) \
1312
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1313
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1314
    int i;\
1315
    for(i=0; i<h; i++)\
1316
    {\
1317
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1318
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1319
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1320
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1321
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1322
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1323
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1324
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1325
        dst+=dstStride;\
1326
        src+=srcStride;\
1327
    }\
1328
}\
1329
\
1330
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1331
    const int w=8;\
1332
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1333
    int i;\
1334
    for(i=0; i<w; i++)\
1335
    {\
1336
        const int src0= src[0*srcStride];\
1337
        const int src1= src[1*srcStride];\
1338
        const int src2= src[2*srcStride];\
1339
        const int src3= src[3*srcStride];\
1340
        const int src4= src[4*srcStride];\
1341
        const int src5= src[5*srcStride];\
1342
        const int src6= src[6*srcStride];\
1343
        const int src7= src[7*srcStride];\
1344
        const int src8= src[8*srcStride];\
1345
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1346
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1347
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1348
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1349
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1350
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1351
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1352
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1353
        dst++;\
1354
        src++;\
1355
    }\
1356
}\
1357
\
1358
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1359
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1360
    int i;\
1361
    \
1362
    for(i=0; i<h; i++)\
1363
    {\
1364
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1365
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1366
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1367
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1368
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1369
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1370
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1371
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1372
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1373
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1374
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1375
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1376
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1377
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1378
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1379
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1380
        dst+=dstStride;\
1381
        src+=srcStride;\
1382
    }\
1383
}\
1384
\
1385
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1386
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1387
    int i;\
1388
    const int w=16;\
1389
    for(i=0; i<w; i++)\
1390
    {\
1391
        const int src0= src[0*srcStride];\
1392
        const int src1= src[1*srcStride];\
1393
        const int src2= src[2*srcStride];\
1394
        const int src3= src[3*srcStride];\
1395
        const int src4= src[4*srcStride];\
1396
        const int src5= src[5*srcStride];\
1397
        const int src6= src[6*srcStride];\
1398
        const int src7= src[7*srcStride];\
1399
        const int src8= src[8*srcStride];\
1400
        const int src9= src[9*srcStride];\
1401
        const int src10= src[10*srcStride];\
1402
        const int src11= src[11*srcStride];\
1403
        const int src12= src[12*srcStride];\
1404
        const int src13= src[13*srcStride];\
1405
        const int src14= src[14*srcStride];\
1406
        const int src15= src[15*srcStride];\
1407
        const int src16= src[16*srcStride];\
1408
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1409
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1410
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1411
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1412
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1413
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1414
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1415
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1416
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1417
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1418
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1419
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1420
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1421
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1422
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1423
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1424
        dst++;\
1425
        src++;\
1426
    }\
1427
}\
1428
\
1429
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1430
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1431
}\
1432
\
1433
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1434
    uint8_t half[64];\
1435
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1436
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1437
}\
1438
\
1439
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1440
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1441
}\
1442
\
1443
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1444
    uint8_t half[64];\
1445
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1446
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1447
}\
1448
\
1449
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1450
    uint8_t full[16*9];\
1451
    uint8_t half[64];\
1452
    copy_block9(full, src, 16, stride, 9);\
1453
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1455
}\
1456
\
1457
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1458
    uint8_t full[16*9];\
1459
    copy_block9(full, src, 16, stride, 9);\
1460
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1461
}\
1462
\
1463
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1464
    uint8_t full[16*9];\
1465
    uint8_t half[64];\
1466
    copy_block9(full, src, 16, stride, 9);\
1467
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1468
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1469
}\
1470
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1471
    uint8_t full[16*9];\
1472
    uint8_t halfH[72];\
1473
    uint8_t halfV[64];\
1474
    uint8_t halfHV[64];\
1475
    copy_block9(full, src, 16, stride, 9);\
1476
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1477
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1478
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1479
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1480
}\
1481
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1482
    uint8_t full[16*9];\
1483
    uint8_t halfH[72];\
1484
    uint8_t halfHV[64];\
1485
    copy_block9(full, src, 16, stride, 9);\
1486
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1487
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1488
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1489
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1490
}\
1491
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1492
    uint8_t full[16*9];\
1493
    uint8_t halfH[72];\
1494
    uint8_t halfV[64];\
1495
    uint8_t halfHV[64];\
1496
    copy_block9(full, src, 16, stride, 9);\
1497
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1498
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1499
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1500
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1501
}\
1502
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1503
    uint8_t full[16*9];\
1504
    uint8_t halfH[72];\
1505
    uint8_t halfHV[64];\
1506
    copy_block9(full, src, 16, stride, 9);\
1507
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1508
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1509
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1510
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1511
}\
1512
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1513
    uint8_t full[16*9];\
1514
    uint8_t halfH[72];\
1515
    uint8_t halfV[64];\
1516
    uint8_t halfHV[64];\
1517
    copy_block9(full, src, 16, stride, 9);\
1518
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1519
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1520
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1521
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1522
}\
1523
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1524
    uint8_t full[16*9];\
1525
    uint8_t halfH[72];\
1526
    uint8_t halfHV[64];\
1527
    copy_block9(full, src, 16, stride, 9);\
1528
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1529
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1530
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1531
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1532
}\
1533
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1534
    uint8_t full[16*9];\
1535
    uint8_t halfH[72];\
1536
    uint8_t halfV[64];\
1537
    uint8_t halfHV[64];\
1538
    copy_block9(full, src, 16, stride, 9);\
1539
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1540
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1541
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1542
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1543
}\
1544
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1545
    uint8_t full[16*9];\
1546
    uint8_t halfH[72];\
1547
    uint8_t halfHV[64];\
1548
    copy_block9(full, src, 16, stride, 9);\
1549
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1550
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1551
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1553
}\
1554
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1555
    uint8_t halfH[72];\
1556
    uint8_t halfHV[64];\
1557
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1558
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1560
}\
1561
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1562
    uint8_t halfH[72];\
1563
    uint8_t halfHV[64];\
1564
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1567
}\
1568
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1569
    uint8_t full[16*9];\
1570
    uint8_t halfH[72];\
1571
    uint8_t halfV[64];\
1572
    uint8_t halfHV[64];\
1573
    copy_block9(full, src, 16, stride, 9);\
1574
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1575
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1576
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1577
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1578
}\
1579
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1580
    uint8_t full[16*9];\
1581
    uint8_t halfH[72];\
1582
    copy_block9(full, src, 16, stride, 9);\
1583
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1584
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1585
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1586
}\
1587
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1588
    uint8_t full[16*9];\
1589
    uint8_t halfH[72];\
1590
    uint8_t halfV[64];\
1591
    uint8_t halfHV[64];\
1592
    copy_block9(full, src, 16, stride, 9);\
1593
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1594
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1595
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1596
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1597
}\
1598
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1599
    uint8_t full[16*9];\
1600
    uint8_t halfH[72];\
1601
    copy_block9(full, src, 16, stride, 9);\
1602
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1604
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1605
}\
1606
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1607
    uint8_t halfH[72];\
1608
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1609
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1610
}\
1611
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1612
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1613
}\
1614
\
1615
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1616
    uint8_t half[256];\
1617
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1618
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1619
}\
1620
\
1621
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1622
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1623
}\
1624
\
1625
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1626
    uint8_t half[256];\
1627
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1628
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1629
}\
1630
\
1631
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1632
    uint8_t full[24*17];\
1633
    uint8_t half[256];\
1634
    copy_block17(full, src, 24, stride, 17);\
1635
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1637
}\
1638
\
1639
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1640
    uint8_t full[24*17];\
1641
    copy_block17(full, src, 24, stride, 17);\
1642
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1643
}\
1644
\
1645
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1646
    uint8_t full[24*17];\
1647
    uint8_t half[256];\
1648
    copy_block17(full, src, 24, stride, 17);\
1649
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1650
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1651
}\
1652
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1653
    uint8_t full[24*17];\
1654
    uint8_t halfH[272];\
1655
    uint8_t halfV[256];\
1656
    uint8_t halfHV[256];\
1657
    copy_block17(full, src, 24, stride, 17);\
1658
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1659
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1660
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1661
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1662
}\
1663
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1664
    uint8_t full[24*17];\
1665
    uint8_t halfH[272];\
1666
    uint8_t halfHV[256];\
1667
    copy_block17(full, src, 24, stride, 17);\
1668
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1669
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1670
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1671
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1672
}\
1673
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1674
    uint8_t full[24*17];\
1675
    uint8_t halfH[272];\
1676
    uint8_t halfV[256];\
1677
    uint8_t halfHV[256];\
1678
    copy_block17(full, src, 24, stride, 17);\
1679
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1680
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1681
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1682
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1683
}\
1684
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1685
    uint8_t full[24*17];\
1686
    uint8_t halfH[272];\
1687
    uint8_t halfHV[256];\
1688
    copy_block17(full, src, 24, stride, 17);\
1689
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1690
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1691
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1692
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1693
}\
1694
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1695
    uint8_t full[24*17];\
1696
    uint8_t halfH[272];\
1697
    uint8_t halfV[256];\
1698
    uint8_t halfHV[256];\
1699
    copy_block17(full, src, 24, stride, 17);\
1700
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1701
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1702
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1703
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1704
}\
1705
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1706
    uint8_t full[24*17];\
1707
    uint8_t halfH[272];\
1708
    uint8_t halfHV[256];\
1709
    copy_block17(full, src, 24, stride, 17);\
1710
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1711
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1712
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1713
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1714
}\
1715
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716
    uint8_t full[24*17];\
1717
    uint8_t halfH[272];\
1718
    uint8_t halfV[256];\
1719
    uint8_t halfHV[256];\
1720
    copy_block17(full, src, 24, stride, 17);\
1721
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1722
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1723
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1724
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1725
}\
1726
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1727
    uint8_t full[24*17];\
1728
    uint8_t halfH[272];\
1729
    uint8_t halfHV[256];\
1730
    copy_block17(full, src, 24, stride, 17);\
1731
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1732
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1733
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1735
}\
1736
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1737
    uint8_t halfH[272];\
1738
    uint8_t halfHV[256];\
1739
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1740
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1742
}\
1743
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1744
    uint8_t halfH[272];\
1745
    uint8_t halfHV[256];\
1746
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1749
}\
1750
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1751
    uint8_t full[24*17];\
1752
    uint8_t halfH[272];\
1753
    uint8_t halfV[256];\
1754
    uint8_t halfHV[256];\
1755
    copy_block17(full, src, 24, stride, 17);\
1756
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1757
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1758
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1759
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1760
}\
1761
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1762
    uint8_t full[24*17];\
1763
    uint8_t halfH[272];\
1764
    copy_block17(full, src, 24, stride, 17);\
1765
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1766
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1767
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1768
}\
1769
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770
    uint8_t full[24*17];\
1771
    uint8_t halfH[272];\
1772
    uint8_t halfV[256];\
1773
    uint8_t halfHV[256];\
1774
    copy_block17(full, src, 24, stride, 17);\
1775
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1776
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1777
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1778
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1779
}\
1780
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1781
    uint8_t full[24*17];\
1782
    uint8_t halfH[272];\
1783
    copy_block17(full, src, 24, stride, 17);\
1784
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1786
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1787
}\
1788
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1789
    uint8_t halfH[272];\
1790
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1791
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1792
}
1793

    
1794
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1795
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1796
#define op_put(a, b) a = cm[((b) + 16)>>5]
1797
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1798

    
1799
QPEL_MC(0, put_       , _       , op_put)
1800
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1801
QPEL_MC(0, avg_       , _       , op_avg)
1802
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1803
#undef op_avg
1804
#undef op_avg_no_rnd
1805
#undef op_put
1806
#undef op_put_no_rnd
1807

    
1808
#if 1
1809
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the C reference half-pel
 * interpolation filters using the 6-tap kernel (1,-5,20,20,-5,1).
 * OP writes back after a single (horizontal or vertical) pass;
 * OP2 writes back after the combined hv pass, whose intermediate rows are
 * kept unclipped in the int16_t tmp buffer.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points (mc00..mc33) of size SIZExSIZE, built on top of the
 * *_h/v/hv_lowpass helpers generated by H264_LOWPASS above.  copy_block*
 * first copies SIZE+5 rows (2 above, 3 below) into a scratch buffer so the
 * vertical filter can read its margin; full_mid points at the block proper.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2149
/* Rounding/clipping output macros for the H.264 6-tap lowpass filters:
   op_put/op_avg consume a one-dimensional filter result via
   ((b)+16)>>5, op2_put/op2_avg consume the 2D (h+v) filter result via
   ((b)+512)>>10; the *avg variants average with the existing
   destination pixel with rounding. `cm` is the clip table in scope at
   each expansion site. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the C reference H.264 quarter-pel motion compensation
   functions, put and avg flavours, for 4x4, 8x8 and 16x16 blocks. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2169

    
2170
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    /* Horizontal 4-tap (-1,9,9,-1)/16 half-pel filter used by WMV2:
       filters 8 pixels per row over h rows, clipping through cropTbl.
       Reads src[-1] and src[9], i.e. one pixel beyond each side. */
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int y, x;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++)
            dst[x]= cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
2187

    
2188
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    /* Vertical 4-tap (-1,9,9,-1)/16 half-pel filter used by WMV2:
       processes one column at a time for w columns, producing 8 output
       rows per column. Reads one row above and two rows below the
       8-row output window (src[-srcStride] .. src[9*srcStride]). */
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    int x, y;

    for(x=0; x<w; x++){
        for(y=0; y<8; y++){
            dst[y*dstStride]= cm[(9*(src[ y   *srcStride] + src[(y+1)*srcStride])
                                -   (src[(y-1)*srcStride] + src[(y+2)*srcStride]) + 8)>>4];
        }
        src++;
        dst++;
    }
}
2216

    
2217
/* WMV2 mspel motion compensation, position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2220

    
2221
/* WMV2 mspel MC, position (1,0): quarter-pel left — combines the
   source block with its horizontally half-pel filtered version via
   put_pixels8_l2 (presumably an averaging merge of the two inputs). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2226

    
2227
/* WMV2 mspel MC, position (2,0): horizontal half-pel filter only. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2230

    
2231
/* WMV2 mspel MC, position (3,0): quarter-pel right — like mc10 but
   combines with the pixel one to the right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2236

    
2237
/* WMV2 mspel MC, position (0,2): vertical half-pel filter only. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2240

    
2241
/* WMV2 mspel MC, position (1,2): combines the vertically filtered
   block with the h+v filtered block. halfH holds 11 horizontally
   filtered rows starting one row above the block (8 output rows plus
   the extra rows the vertical pass needs); halfH+8 skips that first
   extra row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2250
/* WMV2 mspel MC, position (3,2): like mc12 but the vertical-only
   prediction is taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2259
/* WMV2 mspel MC, position (2,2): horizontal then vertical half-pel
   filtering (separable 2D half-pel interpolation). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2264

    
2265
/* H.263 in-loop deblocking filter across a horizontal block boundary
   (filters pixels vertically). src points at the first row below the
   edge; the four rows src[-2*stride]..src[+1*stride] are filtered for
   each of the 8 columns, with filter strength derived from qscale. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: d1 follows d for small |d|, ramps
           back toward 0 between strength and 2*strength, and is 0
           beyond that (a large step is treated as a real edge). */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* branchless-ish clip to 0..255: bit 8 set means out of range
           (assuming the value stays within -256..511); ~(p>>31) then
           yields 255 for overflow, 0 for underflow. */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction of the two outer pixels,
           bounded by half the inner correction */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2299

    
2300
/* H.263 in-loop deblocking filter across a vertical block boundary
   (filters pixels horizontally). src points at the first column right
   of the edge; pixels src[y*stride-2]..src[y*stride+1] are filtered
   for each of the 8 rows. Same math as h263_v_loop_filter_c. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];
    
    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: full correction for small |d|,
           tapering to none for |d| >= 2*strength (real edge). */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;
        
        p1 += d1;
        p2 -= d1;
        /* clip to 0..255 via the bit-8 test (valid for -256..511);
           ~(p>>31) is 255 on overflow, 0 on underflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);
        
        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* weaker correction of the outer pixel pair */
        ad1= ABS(d1)>>1;
        
        d2= clip((p0-p3)/4, -ad1, ad1);
        
        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2334

    
2335
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* Sum of absolute differences (SAD) over a 16-pixel-wide block of
       h rows; both blocks advance by line_size per row. */
    int sum, y, x;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2362

    
2363
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the horizontal half-pel interpolation of
       pix2 (avg2 of each pixel and its right neighbour); reads
       pix2[16], one pixel past the 16-wide block. */
    int sum, y, x;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2390

    
2391
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the vertical half-pel interpolation of pix2
       (avg2 of each pixel and the one directly below it). */
    int sum, y, x;
    uint8_t *below = pix2 + line_size;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2420

    
2421
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD of pix1 against the diagonal half-pel interpolation of pix2
       (avg4 over the 2x2 neighbourhood); reads one extra column. */
    int sum, y, x;
    uint8_t *below = pix2 + line_size;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2450

    
2451
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD over an 8-pixel-wide block of h rows. */
    int sum, y, x;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2470

    
2471
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD against the horizontal half-pel interpolation of pix2
       (8-wide variant); reads pix2[8], one past the block. */
    int sum, y, x;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2490

    
2491
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD against the vertical half-pel interpolation of pix2
       (8-wide variant). */
    int sum, y, x;
    uint8_t *below = pix2 + line_size;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2512

    
2513
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* SAD against the diagonal half-pel interpolation of pix2
       (8-wide variant); reads one extra column and row. */
    int sum, y, x;
    uint8_t *below = pix2 + line_size;

    sum = 0;
    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2534

    
2535
/**
2536
 * permutes an 8x8 block.
2537
 * @param block the block which will be permuted according to the given permutation vector
2538
 * @param permutation the permutation vector
2539
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2540
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2541
 *                  (inverse) permutated to scantable order!
2542
 */
2543
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2544
{
2545
    int i;
2546
    DCTELEM temp[64];
2547
    
2548
    if(last<=0) return;
2549
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2550

    
2551
    for(i=0; i<=last; i++){
2552
        const int j= scantable[i];
2553
        temp[j]= block[j];
2554
        block[j]=0;
2555
    }
2556
    
2557
    for(i=0; i<=last; i++){
2558
        const int j= scantable[i];
2559
        const int perm_j= permutation[j];
2560
        block[perm_j]= temp[j];
2561
    }
2562
}
2563

    
2564
/* Dummy comparator for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2567

    
2568
/* Fills the 5-entry comparison-function array cmp with the functions
   from DSPContext c selected by the low byte of type (an FF_CMP_*
   value). On an unknown type the entries stay NULL from the memset
   and an error is logged. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;
    
    memset(cmp, 0, sizeof(void*)*5);
        
    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2610

    
2611
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 64-coefficient blocks of one macroblock */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2618

    
2619
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* dst[i] += src[i] for i in [0,w), bytewise (wraps mod 256). */
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
2634

    
2635
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* dst[i] = src1[i] - src2[i] for i in [0,w), bytewise
       (wraps mod 256). */
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
2650

    
2651
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    /* HuffYUV median-prediction residual: predicts each src2[i] with
       mid_pred of the left neighbour, the pixel above (src1[i]) and
       their gradient, and stores the difference. *left and *left_top
       carry the running state across calls. */
    int i;
    uint8_t prev, prev_top;

    prev    = *left;
    prev_top= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(prev, src1[i], (prev + src1[i] - prev_top)&0xFF);
        prev_top= src1[i];
        prev    = src2[i];
        dst[i]= prev - pred;
    }

    *left    = prev;
    *left_top= prev_top;
}
2668

    
2669
/* o1/o2 receive the sum and difference of i1/i2 (2-point butterfly). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly on the lvalues x and y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly stage folded into the SATD sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2683

    
2684
/* 8x8 SATD: applies a 2D 8-point Hadamard transform to the pixel
   difference src-dst and returns the sum of absolute transform
   coefficients. h must be 8. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);

    /* horizontal 8-point Hadamard on each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard on each column; the last butterfly
       stage is folded into BUTTERFLYA while accumulating |coeffs| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2735

    
2736
/* Intra 8x8 SATD: Hadamard-transforms the source block itself
   (no reference) and sums |coefficients|, then subtracts the DC-like
   term so the score ignores the block mean. h must be 8. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;
    
    assert(h==8);
    
    /* horizontal 8-point Hadamard on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
        
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass with the final stage folded into the |.| sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
        
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    
        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean
    
    return sum;
}
2783

    
2784
/* DCT-domain SAD: forward-DCTs the 8x8 difference src1-src2 and
   returns the sum of absolute coefficients. h must be 8. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte-aligned backing store reinterpreted as DCTELEM[64] */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;
    
    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);
        
    return sum;
}
2800

    
2801
void simple_idct(DCTELEM *block); //FIXME
2802

    
2803
/* Quantization-noise metric: quantizes and dequantizes the 8x8
   difference at the current qscale, inverse-transforms it, and
   returns the squared error against the unquantized difference.
   NOTE(review): temp is compared against the pre-DCT difference,
   which presumably works because fast_dct_quantize performs the
   forward DCT internally — confirm against the quantizer. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* aligned storage for two DCTELEM[64] blocks: temp and bak */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;   /* force inter quantization path */
    
    s->dsp.diff_pixels(temp, src1, src2, stride);
    
    memcpy(bak, temp, 64*sizeof(DCTELEM));   /* keep the clean copy */
    
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 
    
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
        
    return sum;
}
2826

    
2827
/* Rate-distortion comparison: quantizes the 8x8 difference, counts
   the VLC bits needed to code the coefficients, reconstructs the
   block, and returns distortion + lambda-weighted rate. h must be 8. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* VLA in uint64_t units: stride*8 bytes, enough for 8 rows of 8
       reconstructed pixels as long as stride >= 8 */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;
    
    assert(h==8);

    /* save the 8x8 prediction (two 32-bit words per row) so idct_add
       can reconstruct on top of it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* pick the VLC length tables for the current coding mode; intra
       additionally pays for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* count run-level VLC bits; levels are biased by +64 to index the
       tables, |level| > 63 falls back to the escape code length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
       
        level= temp[i] + 64;

        assert(level - 64);   /* the last coefficient must be nonzero */
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    
    }

    /* reconstruct: dequantize, inverse transform onto the saved
       prediction, then measure SSE against the original block */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }
    
    s->dsp.idct_add(bak, stride, temp);
    
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* distortion + rate * lambda (109/128 * qscale^2 weighting) */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2905

    
2906
/* Rate-only comparison: quantizes the 8x8 difference and returns the
   number of VLC bits needed to code the coefficients. h must be 8. */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);
    
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;
    
    /* select VLC length tables; intra pays for DC separately */
    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }
    
    /* count run-level VLC bits; levels are biased by +64, out-of-range
       levels cost the escape code length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];
        
            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];
                
        level= temp[i] + 64;
        
        assert(level - 64);   /* last coefficient must be nonzero */
        
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2965

    
2966
/**
 * Vertical sum of absolute differences over a 16-pixel-wide block:
 * sums |s[x][y] - s[x][y+1]| between each pair of adjacent rows.
 * Measures vertical activity of a single (intra) block; "dummy" and the
 * context pointer are unused.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            /* absolute difference between vertically adjacent pixels */
            int d= s[x] - s[x+stride];
            score+= d>=0 ? d : -d;
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical SAD of the residual between two 16-pixel-wide blocks:
 * sums |(s1[x][y]-s2[x][y]) - (s1[x][y+1]-s2[x][y+1])| over adjacent rows,
 * i.e. the vertical activity of the difference signal s1-s2.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            /* vertical gradient of the residual at column x */
            int d= s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score+= d>=0 ? d : -d;
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
/**
 * Vertical sum of squared differences over a 16-pixel-wide block:
 * sums (s[x][y] - s[x][y+1])^2 between each pair of adjacent rows.
 * Squared-error counterpart of vsad_intra16_c; context and "dummy" unused.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            /* process four columns per iteration */
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical SSE of the residual between two 16-pixel-wide blocks:
 * sums ((s1[x][y]-s2[x][y]) - (s1[x][y+1]-s2[x][y+1]))^2 over adjacent rows.
 * Squared-error counterpart of vsad16_c.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            /* vertical gradient of the residual at column x, squared */
            int d= s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score+= d*d;
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3028
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3029
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3030
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3031
WARPER8_16_SQ(rd8x8_c, rd16_c)
3032
WARPER8_16_SQ(bit8x8_c, bit16_c)
3033

    
3034
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3035
 converted */
3036
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3037
{
3038
    j_rev_dct (block);
3039
    put_pixels_clamped_c(block, dest, line_size);
3040
}
3041
/** Reference (jrevdct) IDCT + clamped add of the result onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* init static data */
3048
void dsputil_static_init(void)
3049
{
3050
    int i;
3051

    
3052
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3053
    for(i=0;i<MAX_NEG_CROP;i++) {
3054
        cropTbl[i] = 0;
3055
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3056
    }
3057
    
3058
    for(i=0;i<512;i++) {
3059
        squareTbl[i] = (i - 256) * (i - 256);
3060
    }
3061
    
3062
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3063
}
3064

    
3065

    
3066
/**
 * Fill a DSPContext with the C reference implementations, then let the
 * architecture-specific init functions override entries with optimized
 * versions, and finally build the IDCT coefficient permutation table.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type records the coefficient
       order the chosen IDCT expects */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* half-pel motion compensation tables: [0]=none [1]=x [2]=y [3]=xy */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    /* third-pel motion compensation (SVQ3); indices 3,7,11,... unused */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarter-pel motion compensation: 16 sub-pixel positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* comparison functions: [0] is the 16x16 version, [1] the 8x8 version */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    /* let the architecture-specific versions override the C defaults */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table for the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}