Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 622348f9

History | View | Annotate | Download (120 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18
 *
19
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20
 */
21
 
22
/**
23
 * @file dsputil.c
24
 * DSP utils
25
 */
26
 
27
#include "avcodec.h"
28
#include "dsputil.h"
29
#include "mpegvideo.h"
30
#include "simple_idct.h"
31
#include "faandct.h"
32

    
33
/* Clamping LUT, indexed as (cropTbl + MAX_NEG_CROP)[v]; used by
 * put/add_pixels_clamped_c below to clamp values to a byte.
 * NOTE(review): table contents are filled in elsewhere (init code not
 * visible in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
34
/* Square LUT, used as sq = squareTbl + 256 with sq[x] presumably == x*x for
 * x in [-256,255] (see pix_norm1_c / sse8_c); filled in elsewhere —
 * initialization is not visible in this chunk. */
uint32_t squareTbl[512];
35

    
36
/* Standard zigzag scan order: entry i is the raster index (row*8 + col)
 * of the i-th coefficient visited inside the 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46

    
47
/* Zigzag scan for the 2-4-8 (interlaced) IDCT.  NOTE: unlike the
 * specification, the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
59

    
60
/* Inverse of zigzag_direct (not permutated), with each entry offset by +1,
 * for the MMX quantizer.  NOTE(review): filled in at init time elsewhere —
 * the initialization code is not visible in this chunk. */
61
uint16_t __align8 inv_zigzag_direct16[64];
62

    
63
/* Alternate horizontal scan order: entry i is the raster index of the i-th
 * coefficient visited inside the 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
73

    
74
/* Alternate vertical scan order: entry i is the raster index of the i-th
 * coefficient visited inside the 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
84

    
85
/* Fixed-point reciprocal table: inverse[b] is (approximately) ceil(2^32/b),
 * chosen so that (a*inverse[b])>>32 == a/b for all 0<=a<=65536 and
 * 2<=b<=255.  This lets a division by a byte value be replaced by one
 * multiply and a shift.  inverse[0] is unused; inverse[1] is saturated to
 * UINT32_MAX (2^32 does not fit in 32 bits). */
const uint32_t inverse[256]={
/*   0-  7 */        0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
/*   8- 15 */ 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
/*  16- 23 */ 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
/*  24- 31 */ 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
/*  32- 39 */ 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
/*  40- 47 */ 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
/*  48- 55 */  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
/*  56- 63 */  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
/*  64- 71 */  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
/*  72- 79 */  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
/*  80- 87 */  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
/*  88- 95 */  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
/*  96-103 */  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
/* 104-111 */  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
/* 112-119 */  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
/* 120-127 */  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
/* 128-135 */  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
/* 136-143 */  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
/* 144-151 */  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
/* 152-159 */  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
/* 160-167 */  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
/* 168-175 */  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
/* 176-183 */  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
/* 184-191 */  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
/* 192-199 */  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
/* 200-207 */  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
/* 208-215 */  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
/* 216-223 */  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
/* 224-231 */  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
/* 232-239 */  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
/* 240-247 */  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
/* 248-255 */  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
120

    
121
/* Input permutation for simple_idct_mmx: entry i gives the position the
 * i-th coefficient must be moved to before running that IDCT.  The table is
 * a permutation of 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
132

    
133
/**
 * Sum all 256 samples of a 16x16 pixel block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of all 16*16 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
154

    
155
static int pix_norm1_c(uint8_t * pix, int line_size)
156
{
157
    int s, i, j;
158
    uint32_t *sq = squareTbl + 256;
159

    
160
    s = 0;
161
    for (i = 0; i < 16; i++) {
162
        for (j = 0; j < 16; j += 8) {
163
#if 0
164
            s += sq[pix[0]];
165
            s += sq[pix[1]];
166
            s += sq[pix[2]];
167
            s += sq[pix[3]];
168
            s += sq[pix[4]];
169
            s += sq[pix[5]];
170
            s += sq[pix[6]];
171
            s += sq[pix[7]];
172
#else
173
#if LONG_MAX > 2147483647
174
            register uint64_t x=*(uint64_t*)pix;
175
            s += sq[x&0xff];
176
            s += sq[(x>>8)&0xff];
177
            s += sq[(x>>16)&0xff];
178
            s += sq[(x>>24)&0xff];
179
            s += sq[(x>>32)&0xff];
180
            s += sq[(x>>40)&0xff];
181
            s += sq[(x>>48)&0xff];
182
            s += sq[(x>>56)&0xff];
183
#else
184
            register uint32_t x=*(uint32_t*)pix;
185
            s += sq[x&0xff];
186
            s += sq[(x>>8)&0xff];
187
            s += sq[(x>>16)&0xff];
188
            s += sq[(x>>24)&0xff];
189
            x=*(uint32_t*)(pix+4);
190
            s += sq[x&0xff];
191
            s += sq[(x>>8)&0xff];
192
            s += sq[(x>>16)&0xff];
193
            s += sq[(x>>24)&0xff];
194
#endif
195
#endif
196
            pix += 8;
197
        }
198
        pix += line_size - 16;
199
    }
200
    return s;
201
}
202

    
203
/**
 * Byte-swap every 32-bit word of a buffer.
 * @param dst destination word array
 * @param src source word array
 * @param w   number of 32-bit words to swap
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* main loop, manually unrolled by 8 words */
    while (i + 8 <= w) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* remaining tail words */
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
220

    
221
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
222
{
223
    int s, i;
224
    uint32_t *sq = squareTbl + 256;
225

    
226
    s = 0;
227
    for (i = 0; i < h; i++) {
228
        s += sq[pix1[0] - pix2[0]];
229
        s += sq[pix1[1] - pix2[1]];
230
        s += sq[pix1[2] - pix2[2]];
231
        s += sq[pix1[3] - pix2[3]];
232
        s += sq[pix1[4] - pix2[4]];
233
        s += sq[pix1[5] - pix2[5]];
234
        s += sq[pix1[6] - pix2[6]];
235
        s += sq[pix1[7] - pix2[7]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[ 0] - pix2[ 0]];
250
        s += sq[pix1[ 1] - pix2[ 1]];
251
        s += sq[pix1[ 2] - pix2[ 2]];
252
        s += sq[pix1[ 3] - pix2[ 3]];
253
        s += sq[pix1[ 4] - pix2[ 4]];
254
        s += sq[pix1[ 5] - pix2[ 5]];
255
        s += sq[pix1[ 6] - pix2[ 6]];
256
        s += sq[pix1[ 7] - pix2[ 7]];
257
        s += sq[pix1[ 8] - pix2[ 8]];
258
        s += sq[pix1[ 9] - pix2[ 9]];
259
        s += sq[pix1[10] - pix2[10]];
260
        s += sq[pix1[11] - pix2[11]];
261
        s += sq[pix1[12] - pix2[12]];
262
        s += sq[pix1[13] - pix2[13]];
263
        s += sq[pix1[14] - pix2[14]];
264
        s += sq[pix1[15] - pix2[15]];
265

    
266
        pix1 += line_size;
267
        pix2 += line_size;
268
    }
269
    return s;
270
}
271

    
272
/**
 * Copy an 8x8 block of pixels into a DCTELEM block (widening each byte).
 * @param block     destination, 64 elements written row-major (8 per row)
 * @param pixels    source top-left pixel
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
290

    
291
/**
 * Compute the 8x8 per-pixel difference s1 - s2 into a DCTELEM block.
 * @param block  destination, 64 elements written row-major (8 per row)
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride between rows (same for both sources)
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
310

    
311

    
312
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
313
                                 int line_size)
314
{
315
    int i;
316
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
317
    
318
    /* read the pixels */
319
    for(i=0;i<8;i++) {
320
        pixels[0] = cm[block[0]];
321
        pixels[1] = cm[block[1]];
322
        pixels[2] = cm[block[2]];
323
        pixels[3] = cm[block[3]];
324
        pixels[4] = cm[block[4]];
325
        pixels[5] = cm[block[5]];
326
        pixels[6] = cm[block[6]];
327
        pixels[7] = cm[block[7]];
328

    
329
        pixels += line_size;
330
        block += 8;
331
    }
332
}
333

    
334
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
335
                          int line_size)
336
{
337
    int i;
338
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
339
    
340
    /* read the pixels */
341
    for(i=0;i<8;i++) {
342
        pixels[0] = cm[pixels[0] + block[0]];
343
        pixels[1] = cm[pixels[1] + block[1]];
344
        pixels[2] = cm[pixels[2] + block[2]];
345
        pixels[3] = cm[pixels[3] + block[3]];
346
        pixels[4] = cm[pixels[4] + block[4]];
347
        pixels[5] = cm[pixels[5] + block[5]];
348
        pixels[6] = cm[pixels[6] + block[6]];
349
        pixels[7] = cm[pixels[7] + block[7]];
350
        pixels += line_size;
351
        block += 8;
352
    }
353
}
354
#if 0
355

356
/* 64-bit scalar pixel put/avg helpers.  This whole branch is currently
 * disabled (it sits under "#if 0"; the 32-bit variant after the #else is
 * the one that gets compiled).  Each helper processes an 8-pixel-wide row
 * as one 64-bit word, using the SWAR per-byte average tricks:
 *     (a|b) - (((a^b) & 0xFE..FE) >> 1)   -> average, rounding up
 *     (a&b) + (((a^b) & 0xFE..FE) >> 1)   -> average, rounding down
 * OP is the output operation (plain store for "put", average with the
 * destination for "avg").
 *
 * FIX: the plain copy helper was defined as OPNAME ## _pixels, but the
 * CALL_2X_PIXELS() wrappers at the bottom expand OPNAME ## _pixels_c, so
 * this branch could never have compiled if enabled.  The helper is renamed
 * to OPNAME ## _pixels_c to match. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* per-byte rounding-up average, used as the OP for the "avg" instantiation */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
496
#else // 64 bit variant
497

    
498
#define PIXOP2(OPNAME, OP) \
499
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
500
    int i;\
501
    for(i=0; i<h; i++){\
502
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
503
        pixels+=line_size;\
504
        block +=line_size;\
505
    }\
506
}\
507
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
508
    int i;\
509
    for(i=0; i<h; i++){\
510
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
511
        pixels+=line_size;\
512
        block +=line_size;\
513
    }\
514
}\
515
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
516
    int i;\
517
    for(i=0; i<h; i++){\
518
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
519
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
520
        pixels+=line_size;\
521
        block +=line_size;\
522
    }\
523
}\
524
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
525
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
526
}\
527
\
528
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
529
                                                int src_stride1, int src_stride2, int h){\
530
    int i;\
531
    for(i=0; i<h; i++){\
532
        uint32_t a,b;\
533
        a= LD32(&src1[i*src_stride1  ]);\
534
        b= LD32(&src2[i*src_stride2  ]);\
535
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
536
        a= LD32(&src1[i*src_stride1+4]);\
537
        b= LD32(&src2[i*src_stride2+4]);\
538
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
539
    }\
540
}\
541
\
542
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
543
                                                int src_stride1, int src_stride2, int h){\
544
    int i;\
545
    for(i=0; i<h; i++){\
546
        uint32_t a,b;\
547
        a= LD32(&src1[i*src_stride1  ]);\
548
        b= LD32(&src2[i*src_stride2  ]);\
549
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
550
        a= LD32(&src1[i*src_stride1+4]);\
551
        b= LD32(&src2[i*src_stride2+4]);\
552
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
553
    }\
554
}\
555
\
556
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
557
                                                int src_stride1, int src_stride2, int h){\
558
    int i;\
559
    for(i=0; i<h; i++){\
560
        uint32_t a,b;\
561
        a= LD32(&src1[i*src_stride1  ]);\
562
        b= LD32(&src2[i*src_stride2  ]);\
563
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
564
    }\
565
}\
566
\
567
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
568
                                                int src_stride1, int src_stride2, int h){\
569
    int i;\
570
    for(i=0; i<h; i++){\
571
        uint32_t a,b;\
572
        a= LD16(&src1[i*src_stride1  ]);\
573
        b= LD16(&src2[i*src_stride2  ]);\
574
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
575
    }\
576
}\
577
\
578
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579
                                                int src_stride1, int src_stride2, int h){\
580
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
581
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
582
}\
583
\
584
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
585
                                                int src_stride1, int src_stride2, int h){\
586
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
587
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
588
}\
589
\
590
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
591
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
592
}\
593
\
594
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
595
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
596
}\
597
\
598
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
599
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
600
}\
601
\
602
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
603
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
604
}\
605
\
606
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
607
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
608
    int i;\
609
    for(i=0; i<h; i++){\
610
        uint32_t a, b, c, d, l0, l1, h0, h1;\
611
        a= LD32(&src1[i*src_stride1]);\
612
        b= LD32(&src2[i*src_stride2]);\
613
        c= LD32(&src3[i*src_stride3]);\
614
        d= LD32(&src4[i*src_stride4]);\
615
        l0=  (a&0x03030303UL)\
616
           + (b&0x03030303UL)\
617
           + 0x02020202UL;\
618
        h0= ((a&0xFCFCFCFCUL)>>2)\
619
          + ((b&0xFCFCFCFCUL)>>2);\
620
        l1=  (c&0x03030303UL)\
621
           + (d&0x03030303UL);\
622
        h1= ((c&0xFCFCFCFCUL)>>2)\
623
          + ((d&0xFCFCFCFCUL)>>2);\
624
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
625
        a= LD32(&src1[i*src_stride1+4]);\
626
        b= LD32(&src2[i*src_stride2+4]);\
627
        c= LD32(&src3[i*src_stride3+4]);\
628
        d= LD32(&src4[i*src_stride4+4]);\
629
        l0=  (a&0x03030303UL)\
630
           + (b&0x03030303UL)\
631
           + 0x02020202UL;\
632
        h0= ((a&0xFCFCFCFCUL)>>2)\
633
          + ((b&0xFCFCFCFCUL)>>2);\
634
        l1=  (c&0x03030303UL)\
635
           + (d&0x03030303UL);\
636
        h1= ((c&0xFCFCFCFCUL)>>2)\
637
          + ((d&0xFCFCFCFCUL)>>2);\
638
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
639
    }\
640
}\
641
\
642
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
643
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
644
}\
645
\
646
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
647
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
648
}\
649
\
650
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
651
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
652
}\
653
\
654
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
655
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
656
}\
657
\
658
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
659
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
660
    int i;\
661
    for(i=0; i<h; i++){\
662
        uint32_t a, b, c, d, l0, l1, h0, h1;\
663
        a= LD32(&src1[i*src_stride1]);\
664
        b= LD32(&src2[i*src_stride2]);\
665
        c= LD32(&src3[i*src_stride3]);\
666
        d= LD32(&src4[i*src_stride4]);\
667
        l0=  (a&0x03030303UL)\
668
           + (b&0x03030303UL)\
669
           + 0x01010101UL;\
670
        h0= ((a&0xFCFCFCFCUL)>>2)\
671
          + ((b&0xFCFCFCFCUL)>>2);\
672
        l1=  (c&0x03030303UL)\
673
           + (d&0x03030303UL);\
674
        h1= ((c&0xFCFCFCFCUL)>>2)\
675
          + ((d&0xFCFCFCFCUL)>>2);\
676
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
677
        a= LD32(&src1[i*src_stride1+4]);\
678
        b= LD32(&src2[i*src_stride2+4]);\
679
        c= LD32(&src3[i*src_stride3+4]);\
680
        d= LD32(&src4[i*src_stride4+4]);\
681
        l0=  (a&0x03030303UL)\
682
           + (b&0x03030303UL)\
683
           + 0x01010101UL;\
684
        h0= ((a&0xFCFCFCFCUL)>>2)\
685
          + ((b&0xFCFCFCFCUL)>>2);\
686
        l1=  (c&0x03030303UL)\
687
           + (d&0x03030303UL);\
688
        h1= ((c&0xFCFCFCFCUL)>>2)\
689
          + ((d&0xFCFCFCFCUL)>>2);\
690
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
691
    }\
692
}\
693
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
694
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
695
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
696
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
697
}\
698
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
699
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
700
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
701
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
702
}\
703
\
704
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
705
{\
706
        int i, a0, b0, a1, b1;\
707
        a0= pixels[0];\
708
        b0= pixels[1] + 2;\
709
        a0 += b0;\
710
        b0 += pixels[2];\
711
\
712
        pixels+=line_size;\
713
        for(i=0; i<h; i+=2){\
714
            a1= pixels[0];\
715
            b1= pixels[1];\
716
            a1 += b1;\
717
            b1 += pixels[2];\
718
\
719
            block[0]= (a1+a0)>>2; /* FIXME non put */\
720
            block[1]= (b1+b0)>>2;\
721
\
722
            pixels+=line_size;\
723
            block +=line_size;\
724
\
725
            a0= pixels[0];\
726
            b0= pixels[1] + 2;\
727
            a0 += b0;\
728
            b0 += pixels[2];\
729
\
730
            block[0]= (a1+a0)>>2;\
731
            block[1]= (b1+b0)>>2;\
732
            pixels+=line_size;\
733
            block +=line_size;\
734
        }\
735
}\
736
\
737
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
738
{\
739
        int i;\
740
        const uint32_t a= LD32(pixels  );\
741
        const uint32_t b= LD32(pixels+1);\
742
        uint32_t l0=  (a&0x03030303UL)\
743
                    + (b&0x03030303UL)\
744
                    + 0x02020202UL;\
745
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
746
                   + ((b&0xFCFCFCFCUL)>>2);\
747
        uint32_t l1,h1;\
748
\
749
        pixels+=line_size;\
750
        for(i=0; i<h; i+=2){\
751
            uint32_t a= LD32(pixels  );\
752
            uint32_t b= LD32(pixels+1);\
753
            l1=  (a&0x03030303UL)\
754
               + (b&0x03030303UL);\
755
            h1= ((a&0xFCFCFCFCUL)>>2)\
756
              + ((b&0xFCFCFCFCUL)>>2);\
757
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
758
            pixels+=line_size;\
759
            block +=line_size;\
760
            a= LD32(pixels  );\
761
            b= LD32(pixels+1);\
762
            l0=  (a&0x03030303UL)\
763
               + (b&0x03030303UL)\
764
               + 0x02020202UL;\
765
            h0= ((a&0xFCFCFCFCUL)>>2)\
766
              + ((b&0xFCFCFCFCUL)>>2);\
767
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
768
            pixels+=line_size;\
769
            block +=line_size;\
770
        }\
771
}\
772
\
773
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
774
{\
775
    int j;\
776
    for(j=0; j<2; j++){\
777
        int i;\
778
        const uint32_t a= LD32(pixels  );\
779
        const uint32_t b= LD32(pixels+1);\
780
        uint32_t l0=  (a&0x03030303UL)\
781
                    + (b&0x03030303UL)\
782
                    + 0x02020202UL;\
783
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
784
                   + ((b&0xFCFCFCFCUL)>>2);\
785
        uint32_t l1,h1;\
786
\
787
        pixels+=line_size;\
788
        for(i=0; i<h; i+=2){\
789
            uint32_t a= LD32(pixels  );\
790
            uint32_t b= LD32(pixels+1);\
791
            l1=  (a&0x03030303UL)\
792
               + (b&0x03030303UL);\
793
            h1= ((a&0xFCFCFCFCUL)>>2)\
794
              + ((b&0xFCFCFCFCUL)>>2);\
795
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796
            pixels+=line_size;\
797
            block +=line_size;\
798
            a= LD32(pixels  );\
799
            b= LD32(pixels+1);\
800
            l0=  (a&0x03030303UL)\
801
               + (b&0x03030303UL)\
802
               + 0x02020202UL;\
803
            h0= ((a&0xFCFCFCFCUL)>>2)\
804
              + ((b&0xFCFCFCFCUL)>>2);\
805
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
806
            pixels+=line_size;\
807
            block +=line_size;\
808
        }\
809
        pixels+=4-line_size*(h+1);\
810
        block +=4-line_size*h;\
811
    }\
812
}\
813
\
814
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
815
{\
816
    int j;\
817
    for(j=0; j<2; j++){\
818
        int i;\
819
        const uint32_t a= LD32(pixels  );\
820
        const uint32_t b= LD32(pixels+1);\
821
        uint32_t l0=  (a&0x03030303UL)\
822
                    + (b&0x03030303UL)\
823
                    + 0x01010101UL;\
824
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
825
                   + ((b&0xFCFCFCFCUL)>>2);\
826
        uint32_t l1,h1;\
827
\
828
        pixels+=line_size;\
829
        for(i=0; i<h; i+=2){\
830
            uint32_t a= LD32(pixels  );\
831
            uint32_t b= LD32(pixels+1);\
832
            l1=  (a&0x03030303UL)\
833
               + (b&0x03030303UL);\
834
            h1= ((a&0xFCFCFCFCUL)>>2)\
835
              + ((b&0xFCFCFCFCUL)>>2);\
836
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
837
            pixels+=line_size;\
838
            block +=line_size;\
839
            a= LD32(pixels  );\
840
            b= LD32(pixels+1);\
841
            l0=  (a&0x03030303UL)\
842
               + (b&0x03030303UL)\
843
               + 0x01010101UL;\
844
            h0= ((a&0xFCFCFCFCUL)>>2)\
845
              + ((b&0xFCFCFCFCUL)>>2);\
846
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847
            pixels+=line_size;\
848
            block +=line_size;\
849
        }\
850
        pixels+=4-line_size*(h+1);\
851
        block +=4-line_size*h;\
852
    }\
853
}\
854
\
855
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
856
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
857
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
858
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
859
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
860
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
861
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
862
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
863

    
864
#define op_avg(a, b) a = rnd_avg32(a, b)
865
#endif
866
#define op_put(a, b) a = b
867

    
868
PIXOP2(avg, op_avg)
869
PIXOP2(put, op_put)
870
#undef op_avg
871
#undef op_put
872

    
873
#define avg2(a,b) ((a+b+1)>>1)
874
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
875

    
876

    
877
/**
 * One-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide block at 1/16-pel precision.
 * x16, y16 are the fractional offsets in 1/16ths (0..16); the four weights
 * A+B+C+D always sum to 256, so >>8 renormalizes. rounder is added before
 * the shift. Reads rows src[0..stride+8]; writes 8 pixels per row for h rows.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Generic global motion compensation for one 8-pixel-wide block.
 * (ox,oy) is the 16.16 fixed-point start vector; (dxx,dyx) advance it per
 * column, (dxy,dyy) per row; shift selects the sub-pel precision s = 1<<shift.
 * Samples outside [0,width)x[0,height) are clamped to the nearest edge via
 * clip() (helper from common.h, outside this hunk). r is the rounding term
 * added before the final >>(shift*2).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, 
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the (unsigned) casts fold the "< 0" test into one compare */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear in both directions */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x) 
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;                    
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y) 
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest edge sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;                    
                    dst[y*stride + x]=    src[index         ];
                }
            }
            
            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/**
 * Full-pel "put" copy for third-pel MC: dispatches on block width to the
 * fixed-width put_pixelsN_c helpers generated by PIXOP2 elsewhere in this
 * file. Widths other than 2/4/8/16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel horizontal interpolation at the 1/3 position:
 * dst[j] ~= round((2*src[j] + src[j+1]) / 3); 683 = round(2^11 / 3).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel horizontal interpolation at the 2/3 position:
 * dst[j] ~= round((src[j] + 2*src[j+1]) / 3); 683 = round(2^11 / 3).
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel vertical interpolation at the 1/3 position:
 * dst[j] ~= round((2*src[j] + src[j+stride]) / 3); 683 = round(2^11 / 3).
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel diagonal interpolation at (1/3, 1/3): weighted average of the
 * 2x2 neighborhood with weights 4/3/3/2 (sum 12); 2731 = round(2^15 / 12).
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel diagonal interpolation at (1/3, 2/3): 2x2 neighborhood with
 * weights 3/2/4/3 (sum 12); 2731 = round(2^15 / 12).
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel vertical interpolation at the 2/3 position:
 * dst[j] ~= round((src[j] + 2*src[j+stride]) / 3); 683 = round(2^11 / 3).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel diagonal interpolation at (2/3, 1/3): 2x2 neighborhood with
 * weights 3/4/2/3 (sum 12); 2731 = round(2^15 / 12).
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel diagonal interpolation at (2/3, 2/3): 2x2 neighborhood with
 * weights 2/3/3/4 (sum 12); 2731 = round(2^15 / 12).
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Full-pel "avg" (average with existing dst) for third-pel MC: dispatches on
 * block width to the fixed-width avg_pixelsN_c helpers generated by PIXOP2
 * elsewhere in this file. Widths other than 2/4/8/16 are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc10_c: computes the same third-pel
 * value, then rounds it together with the existing dst pixel ((+1)>>1).
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc20_c: same third-pel value,
 * rounded together with the existing dst pixel ((+1)>>1).
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc01_c: same third-pel value,
 * rounded together with the existing dst pixel ((+1)>>1).
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc11_c (weights 4/3/3/2, sum 12):
 * same third-pel value, rounded together with the existing dst pixel.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc12_c (weights 3/2/4/3, sum 12):
 * same third-pel value, rounded together with the existing dst pixel.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc02_c: same third-pel value,
 * rounded together with the existing dst pixel ((+1)>>1).
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc21_c (weights 3/4/2/3, sum 12):
 * same third-pel value, rounded together with the existing dst pixel.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Averaging variant of put_tpel_pixels_mc22_c (weights 2/3/3/4, sum 12):
 * same third-pel value, rounded together with the existing dst pixel.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
/* Disabled width-specialized tpel wrappers (kept verbatim; note the stray
   "void" before each call would need removing if this block is ever enabled). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
/*
 * H.264 chroma motion compensation, 1/8-pel bilinear, for 2/4/8-wide blocks.
 * x and y are the fractional offsets (0..7); the four weights A+B+C+D sum
 * to 64, and OP applies the final rounding/shift (see op_put/op_avg below).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1237
#define op_put(a, b) a = (((b) + 32)>>6)
1238

    
1239
H264_CHROMA_MC(put_       , op_put)
1240
H264_CHROMA_MC(avg_       , op_avg)
1241
#undef op_avg
1242
#undef op_put
1243

    
1244
/* Copy a 4-wide, h-high block using 32-bit loads/stores.
   LD32/ST32 come from dsputil.h (outside this hunk); presumably they handle
   unaligned access — confirm before changing alignment assumptions. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy an 8-wide, h-high block as two 32-bit loads/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 16-wide, h-high block as four 32-bit loads/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 17-wide block (16 bytes via 32-bit stores plus one trailing byte);
   the extra column is the right-edge sample needed by the qpel filters. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
/* Copy a 9-wide block (8 bytes via 32-bit stores plus one trailing byte);
   the extra column is the right-edge sample needed by the qpel filters. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
1311
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313
    int i;\
1314
    for(i=0; i<h; i++)\
1315
    {\
1316
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324
        dst+=dstStride;\
1325
        src+=srcStride;\
1326
    }\
1327
}\
1328
\
1329
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330
    const int w=8;\
1331
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332
    int i;\
1333
    for(i=0; i<w; i++)\
1334
    {\
1335
        const int src0= src[0*srcStride];\
1336
        const int src1= src[1*srcStride];\
1337
        const int src2= src[2*srcStride];\
1338
        const int src3= src[3*srcStride];\
1339
        const int src4= src[4*srcStride];\
1340
        const int src5= src[5*srcStride];\
1341
        const int src6= src[6*srcStride];\
1342
        const int src7= src[7*srcStride];\
1343
        const int src8= src[8*srcStride];\
1344
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352
        dst++;\
1353
        src++;\
1354
    }\
1355
}\
1356
\
1357
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359
    int i;\
1360
    \
1361
    for(i=0; i<h; i++)\
1362
    {\
1363
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379
        dst+=dstStride;\
1380
        src+=srcStride;\
1381
    }\
1382
}\
1383
\
1384
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386
    int i;\
1387
    const int w=16;\
1388
    for(i=0; i<w; i++)\
1389
    {\
1390
        const int src0= src[0*srcStride];\
1391
        const int src1= src[1*srcStride];\
1392
        const int src2= src[2*srcStride];\
1393
        const int src3= src[3*srcStride];\
1394
        const int src4= src[4*srcStride];\
1395
        const int src5= src[5*srcStride];\
1396
        const int src6= src[6*srcStride];\
1397
        const int src7= src[7*srcStride];\
1398
        const int src8= src[8*srcStride];\
1399
        const int src9= src[9*srcStride];\
1400
        const int src10= src[10*srcStride];\
1401
        const int src11= src[11*srcStride];\
1402
        const int src12= src[12*srcStride];\
1403
        const int src13= src[13*srcStride];\
1404
        const int src14= src[14*srcStride];\
1405
        const int src15= src[15*srcStride];\
1406
        const int src16= src[16*srcStride];\
1407
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423
        dst++;\
1424
        src++;\
1425
    }\
1426
}\
1427
\
1428
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1430
}\
1431
\
1432
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433
    uint8_t half[64];\
1434
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436
}\
1437
\
1438
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440
}\
1441
\
1442
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443
    uint8_t half[64];\
1444
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446
}\
1447
\
1448
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449
    uint8_t full[16*9];\
1450
    uint8_t half[64];\
1451
    copy_block9(full, src, 16, stride, 9);\
1452
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454
}\
1455
\
1456
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457
    uint8_t full[16*9];\
1458
    copy_block9(full, src, 16, stride, 9);\
1459
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460
}\
1461
\
1462
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463
    uint8_t full[16*9];\
1464
    uint8_t half[64];\
1465
    copy_block9(full, src, 16, stride, 9);\
1466
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468
}\
1469
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470
    uint8_t full[16*9];\
1471
    uint8_t halfH[72];\
1472
    uint8_t halfV[64];\
1473
    uint8_t halfHV[64];\
1474
    copy_block9(full, src, 16, stride, 9);\
1475
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479
}\
1480
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481
    uint8_t full[16*9];\
1482
    uint8_t halfH[72];\
1483
    uint8_t halfHV[64];\
1484
    copy_block9(full, src, 16, stride, 9);\
1485
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489
}\
1490
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491
    uint8_t full[16*9];\
1492
    uint8_t halfH[72];\
1493
    uint8_t halfV[64];\
1494
    uint8_t halfHV[64];\
1495
    copy_block9(full, src, 16, stride, 9);\
1496
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500
}\
1501
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502
    uint8_t full[16*9];\
1503
    uint8_t halfH[72];\
1504
    uint8_t halfHV[64];\
1505
    copy_block9(full, src, 16, stride, 9);\
1506
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510
}\
1511
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512
    uint8_t full[16*9];\
1513
    uint8_t halfH[72];\
1514
    uint8_t halfV[64];\
1515
    uint8_t halfHV[64];\
1516
    copy_block9(full, src, 16, stride, 9);\
1517
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521
}\
1522
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523
    uint8_t full[16*9];\
1524
    uint8_t halfH[72];\
1525
    uint8_t halfHV[64];\
1526
    copy_block9(full, src, 16, stride, 9);\
1527
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531
}\
1532
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533
    uint8_t full[16*9];\
1534
    uint8_t halfH[72];\
1535
    uint8_t halfV[64];\
1536
    uint8_t halfHV[64];\
1537
    copy_block9(full, src, 16, stride, 9);\
1538
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1539
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542
}\
1543
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544
    uint8_t full[16*9];\
1545
    uint8_t halfH[72];\
1546
    uint8_t halfHV[64];\
1547
    copy_block9(full, src, 16, stride, 9);\
1548
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552
}\
1553
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554
    uint8_t halfH[72];\
1555
    uint8_t halfHV[64];\
1556
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559
}\
1560
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561
    uint8_t halfH[72];\
1562
    uint8_t halfHV[64];\
1563
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566
}\
1567
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568
    uint8_t full[16*9];\
1569
    uint8_t halfH[72];\
1570
    uint8_t halfV[64];\
1571
    uint8_t halfHV[64];\
1572
    copy_block9(full, src, 16, stride, 9);\
1573
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577
}\
1578
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579
    uint8_t full[16*9];\
1580
    uint8_t halfH[72];\
1581
    copy_block9(full, src, 16, stride, 9);\
1582
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585
}\
1586
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587
    uint8_t full[16*9];\
1588
    uint8_t halfH[72];\
1589
    uint8_t halfV[64];\
1590
    uint8_t halfHV[64];\
1591
    copy_block9(full, src, 16, stride, 9);\
1592
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596
}\
1597
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598
    uint8_t full[16*9];\
1599
    uint8_t halfH[72];\
1600
    copy_block9(full, src, 16, stride, 9);\
1601
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604
}\
1605
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606
    uint8_t halfH[72];\
1607
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609
}\
1610
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1612
}\
1613
\
1614
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615
    uint8_t half[256];\
1616
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618
}\
1619
\
1620
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622
}\
1623
\
1624
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625
    uint8_t half[256];\
1626
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628
}\
1629
\
1630
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631
    uint8_t full[24*17];\
1632
    uint8_t half[256];\
1633
    copy_block17(full, src, 24, stride, 17);\
1634
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636
}\
1637
\
1638
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639
    uint8_t full[24*17];\
1640
    copy_block17(full, src, 24, stride, 17);\
1641
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642
}\
1643
\
1644
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645
    uint8_t full[24*17];\
1646
    uint8_t half[256];\
1647
    copy_block17(full, src, 24, stride, 17);\
1648
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650
}\
1651
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652
    uint8_t full[24*17];\
1653
    uint8_t halfH[272];\
1654
    uint8_t halfV[256];\
1655
    uint8_t halfHV[256];\
1656
    copy_block17(full, src, 24, stride, 17);\
1657
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661
}\
1662
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663
    uint8_t full[24*17];\
1664
    uint8_t halfH[272];\
1665
    uint8_t halfHV[256];\
1666
    copy_block17(full, src, 24, stride, 17);\
1667
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671
}\
1672
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673
    uint8_t full[24*17];\
1674
    uint8_t halfH[272];\
1675
    uint8_t halfV[256];\
1676
    uint8_t halfHV[256];\
1677
    copy_block17(full, src, 24, stride, 17);\
1678
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682
}\
1683
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684
    uint8_t full[24*17];\
1685
    uint8_t halfH[272];\
1686
    uint8_t halfHV[256];\
1687
    copy_block17(full, src, 24, stride, 17);\
1688
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692
}\
1693
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694
    uint8_t full[24*17];\
1695
    uint8_t halfH[272];\
1696
    uint8_t halfV[256];\
1697
    uint8_t halfHV[256];\
1698
    copy_block17(full, src, 24, stride, 17);\
1699
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703
}\
1704
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705
    uint8_t full[24*17];\
1706
    uint8_t halfH[272];\
1707
    uint8_t halfHV[256];\
1708
    copy_block17(full, src, 24, stride, 17);\
1709
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713
}\
1714
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715
    uint8_t full[24*17];\
1716
    uint8_t halfH[272];\
1717
    uint8_t halfV[256];\
1718
    uint8_t halfHV[256];\
1719
    copy_block17(full, src, 24, stride, 17);\
1720
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1721
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724
}\
1725
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726
    uint8_t full[24*17];\
1727
    uint8_t halfH[272];\
1728
    uint8_t halfHV[256];\
1729
    copy_block17(full, src, 24, stride, 17);\
1730
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734
}\
1735
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736
    uint8_t halfH[272];\
1737
    uint8_t halfHV[256];\
1738
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741
}\
1742
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743
    uint8_t halfH[272];\
1744
    uint8_t halfHV[256];\
1745
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748
}\
1749
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750
    uint8_t full[24*17];\
1751
    uint8_t halfH[272];\
1752
    uint8_t halfV[256];\
1753
    uint8_t halfHV[256];\
1754
    copy_block17(full, src, 24, stride, 17);\
1755
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759
}\
1760
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761
    uint8_t full[24*17];\
1762
    uint8_t halfH[272];\
1763
    copy_block17(full, src, 24, stride, 17);\
1764
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767
}\
1768
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[24*17];\
1770
    uint8_t halfH[272];\
1771
    uint8_t halfV[256];\
1772
    uint8_t halfHV[256];\
1773
    copy_block17(full, src, 24, stride, 17);\
1774
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778
}\
1779
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780
    uint8_t full[24*17];\
1781
    uint8_t halfH[272];\
1782
    copy_block17(full, src, 24, stride, 17);\
1783
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786
}\
1787
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788
    uint8_t halfH[272];\
1789
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791
}
1792

    
1793
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1794
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1795
#define op_put(a, b) a = cm[((b) + 16)>>5]
1796
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1797

    
1798
QPEL_MC(0, put_       , _       , op_put)
1799
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1800
QPEL_MC(0, avg_       , _       , op_avg)
1801
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1802
#undef op_avg
1803
#undef op_avg_no_rnd
1804
#undef op_put
1805
#undef op_put_no_rnd
1806

    
1807
#if 1
1808
/* H.264 6-tap half-pel interpolation (taps 1,-5,20,20,-5,1), expanded for
 * 4x4, 8x8 and 16x16 blocks.  OP stores a single-pass result (value scaled
 * by 32); OP2 stores the two-pass h+v result (scaled by 1024).  The hv
 * variants keep the horizontal pass in a 16-bit tmp buffer of h+5 rows so
 * the vertical pass works on unclipped intermediates. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* Generates the 16 H.264 quarter-pel MC functions (mcXY, X/Y in 0..3 quarter
 * pixels) for one block SIZE, built from the lowpass primitives above plus
 * copy_block/pixels averaging helpers.  The padded "full" buffer holds
 * SIZE+5 source rows so the 6-tap vertical filter can read 2 rows above and
 * 3 below; full_mid points at the first in-block row. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2149
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2150
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2151
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2152
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2153

    
2154
H264_LOWPASS(put_       , op_put, op2_put)
2155
H264_LOWPASS(avg_       , op_avg, op2_avg)
2156
H264_MC(put_, 4)
2157
H264_MC(put_, 8)
2158
H264_MC(put_, 16)
2159
H264_MC(avg_, 4)
2160
H264_MC(avg_, 8)
2161
H264_MC(avg_, 16)
2162

    
2163
#undef op_avg
2164
#undef op_put
2165
#undef op2_avg
2166
#undef op2_put
2167
#endif
2168

    
2169
/**
 * Horizontal half-pel lowpass filter for WMV2 motion compensation.
 * For each of h rows computes dst[x] = clip((9*(src[x]+src[x+1]) - (src[x-1]+src[x+2]) + 8) >> 4)
 * for x in 0..7; reads one pixel left and two pixels right of the 8-pixel row.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clamp-to-[0,255] lookup table */
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
2186

    
2187
/**
 * Vertical half-pel lowpass filter for WMV2 motion compensation.
 * For each of w columns computes 8 output rows with the same (9,9,-1,-1)
 * kernel as the horizontal filter; reads rows -1..9 of the source column.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clamp-to-[0,255] lookup table */
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
2215

    
2216
/** Full-pel (no filtering) 8x8 copy for WMV2 mspel MC. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2219

    
2220
/** Quarter-pel left position: average of source and horizontal half-pel filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2225

    
2226
/** Horizontal half-pel position: plain horizontal lowpass into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2229

    
2230
/** Quarter-pel right position: average of src+1 and horizontal half-pel filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2235

    
2236
/** Vertical half-pel position: plain vertical lowpass into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2239

    
2240
/** Diagonal position: average of vertical half-pel and HV half-pel results. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: horizontally filtered, one extra row above and two below */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2249
/** Diagonal position: like mc12 but the vertical-only tap starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 horizontally filtered intermediate */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2258
/** Center half-pel position: horizontal then vertical lowpass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 horizontally filtered intermediate */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2263

    
2264
/**
 * H.263 deblocking filter across a horizontal block edge (filters 8 columns).
 * src points at the first row below the edge; rows -2..+1 around the edge
 * are modified. Filter strength is looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: full correction for small |d|,
           tapering to zero beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clamp to 0..255 (valid since p1/p2 stay in -256..511) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* secondary, smaller correction on the outer pixels */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2298

    
2299
/**
 * H.263 deblocking filter across a vertical block edge (filters 8 rows).
 * src points at the first column right of the edge; columns -2..+1 around
 * the edge are modified. Same ramp and clamping as the vertical variant.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp identical to h263_v_loop_filter_c */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clamp to 0..255 (valid since p1/p2 stay in -256..511) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2333

    
2334
/**
 * Sum of absolute differences over a 16-wide block of h rows (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 * @return sum of |pix1[i]-pix2[i]| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2361

    
2362
/**
 * SAD of pix1 against pix2 interpolated at horizontal half-pel position
 * (each reference pixel is avg2 of two horizontal neighbours); 16-wide, h rows.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2389

    
2390
/**
 * SAD of pix1 against pix2 interpolated at vertical half-pel position
 * (each reference pixel is avg2 of two vertical neighbours); 16-wide, h rows.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2419

    
2420
/**
 * SAD of pix1 against pix2 interpolated at the diagonal half-pel position
 * (each reference pixel is avg4 of a 2x2 neighbourhood); 16-wide, h rows.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2449

    
2450
/**
 * Sum of absolute differences over an 8-wide block of h rows (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 * @return sum of |pix1[i]-pix2[i]| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2469

    
2470
/** SAD against the horizontal half-pel interpolation (avg2); 8-wide, h rows. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2489

    
2490
/** SAD against the vertical half-pel interpolation (avg2); 8-wide, h rows. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2511

    
2512
/** SAD against the diagonal half-pel interpolation (avg4 of 2x2); 8-wide, h rows. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2533

    
2534
/**
2535
 * permutes an 8x8 block.
2536
 * @param block the block which will be permuted according to the given permutation vector
2537
 * @param permutation the permutation vector
2538
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2539
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not 
2540
 *                  (inverse) permutated to scantable order!
2541
 */
2542
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2543
{
2544
    int i;
2545
    DCTELEM temp[64];
2546
    
2547
    if(last<=0) return;
2548
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2549

    
2550
    for(i=0; i<=last; i++){
2551
        const int j= scantable[i];
2552
        temp[j]= block[j];
2553
        block[j]=0;
2554
    }
2555
    
2556
    for(i=0; i<=last; i++){
2557
        const int j= scantable[i];
2558
        const int perm_j= permutation[j];
2559
        block[perm_j]= temp[j];
2560
    }
2561
}
2562

    
2563
/** Dummy compare function for FF_CMP_ZERO: always reports a cost of 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2566

    
2567
/**
 * Fills the 5-entry cmp function table with the comparators selected by
 * the low byte of type (one of the FF_CMP_* ids), taken from the DSPContext.
 * Unknown ids leave the entry NULL and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2609

    
2610
/**
2611
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2612
 */
2613
static void clear_blocks_c(DCTELEM *blocks)
2614
{
2615
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
2616
}
2617

    
2618
/**
 * dst[i] += src[i] for i in 0..w-1 (byte-wise, wrapping modulo 256).
 * Main loop is unrolled by 8; a tail loop handles the remaining bytes.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
2633

    
2634
/**
 * dst[i] = src1[i] - src2[i] for i in 0..w-1 (byte-wise, wrapping modulo 256).
 * Main loop is unrolled by 8; a tail loop handles the remaining bytes.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
2649

    
2650
/**
 * HuffYUV median-prediction residual: dst[i] = src2[i] - mid_pred(left, top, left+top-topleft),
 * where src1 is the previous line (top) and src2 the current one.
 * *left and *left_top carry the running left/top-left state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];   /* new top-left is the old top */
        l= src2[i];    /* new left is the current pixel */
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
2667

    
2668
/* Butterfly helpers for the Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place,
 * BUTTERFLYA returns |x+y| + |x-y| (final accumulation stage). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2682

    
2683
/**
 * 8x8 SATD: Hadamard-transforms the difference src-dst (rows first, then
 * columns) and returns the sum of absolute transform coefficients.
 * (Removed a dead `#if 0` max-tracking debug snippet.)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficients| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2734

    
2735
/**
 * Intra 8x8 SATD: Hadamard-transforms the source block itself and returns the
 * sum of absolute coefficients minus the DC term (|temp[0]+temp[32]| after the
 * column pass equals the block mean contribution).
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard transform of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficients| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum += 
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2782

    
2783
/**
 * DCT-based 8x8 compare: forward-DCTs the pixel difference and returns the
 * sum of absolute DCT coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8]; /* 8-byte aligned DCTELEM[64] */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2799

    
2800
void simple_idct(DCTELEM *block); //FIXME
2801

    
2802
/**
 * Quantization-noise compare: DCT-quantize/dequantize the 8x8 difference and
 * return the squared error between the round-tripped and original coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; /* two aligned DCTELEM[64] buffers */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM)); /* keep the unquantized coefficients */

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME 

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2825

    
2826
/**
 * Rate-distortion compare for an 8x8 block: quantizes the difference, counts
 * the VLC bits needed to code it, reconstructs the block and returns
 * distortion (SSE) plus a lambda-weighted bit cost.
 * (Fixed local variable typo: distoration -> distortion.)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride]; /* VLA: 8 rows of stride bytes, 8-byte aligned */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 reference block (two 32-bit copies per row) */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count VLC bits for all (run, level) pairs, escape-coding large levels */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT back onto the saved block */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2904

    
2905
/**
 * Bit-count compare for an 8x8 block: quantizes the difference and returns
 * the number of VLC bits needed to code the quantized coefficients.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1; 
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count VLC bits for all (run, level) pairs, escape-coding large levels */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2964

    
2965
/**
 * Vertical sum of absolute differences over a 16-pixel-wide block:
 * sums |s[y][x] - s[y+1][x]| over h-1 adjacent row pairs.
 * The second source (dummy) is unused; signature matches the cmp API.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum= 0;
    int row, col;

    for(row= 1; row < h; row++){
        for(col= 0; col < 16; col++){
            const int d= s[col] - s[col + stride];
            sum+= d >= 0 ? d : -d;
        }
        s+= stride;
    }

    return sum;
}
/**
 * Vertical SAD of the difference signal s1-s2 over a 16-pixel-wide block:
 * sums |(s1-s2)[y][x] - (s1-s2)[y+1][x]| over h-1 adjacent row pairs.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum= 0;
    int row, col;

    for(row= 1; row < h; row++){
        for(col= 0; col < 16; col++){
            const int d= s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            sum+= d >= 0 ? d : -d;
        }
        s1+= stride;
        s2+= stride;
    }

    return sum;
}
#define SQ(a) ((a)*(a))

/**
 * Vertical sum of squared differences over a 16-pixel-wide block:
 * sums (s[y][x] - s[y+1][x])^2 over h-1 adjacent row pairs.
 * The second source (dummy) is unused; signature matches the cmp API.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum= 0;
    int row, col;

    for(row= 1; row < h; row++){
        for(col= 0; col < 16; col++){
            const int d= s[col] - s[col + stride];
            sum+= d * d;
        }
        s+= stride;
    }

    return sum;
}
/**
 * Vertical SSE of the difference signal s1-s2 over a 16-pixel-wide block:
 * sums ((s1-s2)[y][x] - (s1-s2)[y+1][x])^2 over h-1 adjacent row pairs.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum= 0;
    int row, col;

    for(row= 1; row < h; row++){
        for(col= 0; col < 16; col++){
            const int d= s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            sum+= d * d;
        }
        s1+= stride;
        s2+= stride;
    }

    return sum;
}
/* Generate 16x16 variants of the 8x8 comparison functions: the
 * WARPER8_16_SQ macro defines a wrapper that applies the 8x8 function to
 * each of the four 8x8 quadrants and accumulates the results. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* Run the jrevdct inverse DCT on block, then store the clamped result
 * into dest (overwriting it). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the jrevdct inverse DCT on block, then add the result to dest
 * with clamping (used for adding a residual to a prediction). */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* init static data */
3047
void dsputil_static_init(void)
3048
{
3049
    int i;
3050

    
3051
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3052
    for(i=0;i<MAX_NEG_CROP;i++) {
3053
        cropTbl[i] = 0;
3054
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3055
    }
3056
    
3057
    for(i=0;i<512;i++) {
3058
        squareTbl[i] = (i - 256) * (i - 256);
3059
    }
3060
    
3061
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3062
}
3063

    
3064

    
3065
/**
 * Fill the DSPContext function-pointer tables with the portable C
 * implementations, honouring the (I)DCT algorithm selected in avctx,
 * then let the platform-specific init functions override entries with
 * optimized versions, and finally build the IDCT coefficient
 * permutation matching whichever IDCT ended up selected.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    } 
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248; 
    } 
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type must match the IDCT */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* pixel block transfer / statistics helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables: [IDX] selects block width (16/8/4/2),
       the second index selects the x2/y2/xy2 interpolation variant */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    /* third-pel motion compensation */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel tables: 16 entries per table, one per (x,y) sub-pel
       position mcXY */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma motion compensation */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* WMV2 mspel motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
        
    /* comparison functions: [0] is the 16x16 variant, [1] the 8x8 one */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
    
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
        
    /* lossless / HuffYUV helpers and byte swapping */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
    
    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    /* let platform-specific code override the C versions */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation for the selected IDCT; the init
       functions above may have changed idct_permutation_type */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}