Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 68b51e58

History | View | Annotate | Download (144 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22

    
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27

    
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33

    
34
/* snow.c */
35
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36

    
37
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
38
uint32_t squareTbl[512] = {0, };
39

    
40
const uint8_t ff_zigzag_direct[64] = {
41
    0,   1,  8, 16,  9,  2,  3, 10,
42
    17, 24, 32, 25, 18, 11,  4,  5,
43
    12, 19, 26, 33, 40, 48, 41, 34,
44
    27, 20, 13,  6,  7, 14, 21, 28,
45
    35, 42, 49, 56, 57, 50, 43, 36,
46
    29, 22, 15, 23, 30, 37, 44, 51,
47
    58, 59, 52, 45, 38, 31, 39, 46,
48
    53, 60, 61, 54, 47, 55, 62, 63
49
};
50

    
51
/* Specific zigzag scan for 248 idct. NOTE that unlike the
52
   specification, we interleave the fields */
53
const uint8_t ff_zigzag248_direct[64] = {
54
     0,  8,  1,  9, 16, 24,  2, 10,
55
    17, 25, 32, 40, 48, 56, 33, 41,
56
    18, 26,  3, 11,  4, 12, 19, 27,
57
    34, 42, 49, 57, 50, 58, 35, 43,
58
    20, 28,  5, 13,  6, 14, 21, 29,
59
    36, 44, 51, 59, 52, 60, 37, 45,
60
    22, 30,  7, 15, 23, 31, 38, 46,
61
    53, 61, 54, 62, 39, 47, 55, 63,
62
};
63

    
64
/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
65
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
66

    
67
const uint8_t ff_alternate_horizontal_scan[64] = {
68
    0,  1,   2,  3,  8,  9, 16, 17,
69
    10, 11,  4,  5,  6,  7, 15, 14,
70
    13, 12, 19, 18, 24, 25, 32, 33,
71
    26, 27, 20, 21, 22, 23, 28, 29,
72
    30, 31, 34, 35, 40, 41, 48, 49,
73
    42, 43, 36, 37, 38, 39, 44, 45,
74
    46, 47, 50, 51, 56, 57, 58, 59,
75
    52, 53, 54, 55, 60, 61, 62, 63,
76
};
77

    
78
const uint8_t ff_alternate_vertical_scan[64] = {
79
    0,  8,  16, 24,  1,  9,  2, 10,
80
    17, 25, 32, 40, 48, 56, 57, 49,
81
    41, 33, 26, 18,  3, 11,  4, 12,
82
    19, 27, 34, 42, 50, 58, 35, 43,
83
    51, 59, 20, 28,  5, 13,  6, 14,
84
    21, 29, 36, 44, 52, 60, 37, 45,
85
    53, 61, 22, 30,  7, 15, 23, 31,
86
    38, 46, 54, 62, 39, 47, 55, 63,
87
};
88

    
89
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
90
const uint32_t inverse[256]={
91
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
92
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
93
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
94
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
95
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
96
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
97
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
98
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
99
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
100
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
101
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
102
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
103
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
104
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
105
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
106
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
107
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
108
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
109
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
110
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
111
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
112
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
113
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
114
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
115
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
116
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
117
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
118
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
119
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
120
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
121
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
122
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
123
};
124

    
125
/* Input permutation for the simple_idct_mmx */
126
static const uint8_t simple_mmx_permutation[64]={
127
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
128
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
129
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
130
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
131
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
132
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
133
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
134
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
135
};
136

    
137
/**
 * Sum of all pixel values in a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        /* step to the first pixel of the next row */
        pix += line_size;
    }
    return total;
}
158

    
159
static int pix_norm1_c(uint8_t * pix, int line_size)
160
{
161
    int s, i, j;
162
    uint32_t *sq = squareTbl + 256;
163

    
164
    s = 0;
165
    for (i = 0; i < 16; i++) {
166
        for (j = 0; j < 16; j += 8) {
167
#if 0
168
            s += sq[pix[0]];
169
            s += sq[pix[1]];
170
            s += sq[pix[2]];
171
            s += sq[pix[3]];
172
            s += sq[pix[4]];
173
            s += sq[pix[5]];
174
            s += sq[pix[6]];
175
            s += sq[pix[7]];
176
#else
177
#if LONG_MAX > 2147483647
178
            register uint64_t x=*(uint64_t*)pix;
179
            s += sq[x&0xff];
180
            s += sq[(x>>8)&0xff];
181
            s += sq[(x>>16)&0xff];
182
            s += sq[(x>>24)&0xff];
183
            s += sq[(x>>32)&0xff];
184
            s += sq[(x>>40)&0xff];
185
            s += sq[(x>>48)&0xff];
186
            s += sq[(x>>56)&0xff];
187
#else
188
            register uint32_t x=*(uint32_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            x=*(uint32_t*)(pix+4);
194
            s += sq[x&0xff];
195
            s += sq[(x>>8)&0xff];
196
            s += sq[(x>>16)&0xff];
197
            s += sq[(x>>24)&0xff];
198
#endif
199
#endif
200
            pix += 8;
201
        }
202
        pix += line_size - 16;
203
    }
204
    return s;
205
}
206

    
207
/**
 * Apply bswap_32() to each of the first w 32-bit words of src and
 * store the results in dst.
 *
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words to process; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
224

    
225
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226
{
227
    int s, i;
228
    uint32_t *sq = squareTbl + 256;
229

    
230
    s = 0;
231
    for (i = 0; i < h; i++) {
232
        s += sq[pix1[0] - pix2[0]];
233
        s += sq[pix1[1] - pix2[1]];
234
        s += sq[pix1[2] - pix2[2]];
235
        s += sq[pix1[3] - pix2[3]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[0] - pix2[0]];
250
        s += sq[pix1[1] - pix2[1]];
251
        s += sq[pix1[2] - pix2[2]];
252
        s += sq[pix1[3] - pix2[3]];
253
        s += sq[pix1[4] - pix2[4]];
254
        s += sq[pix1[5] - pix2[5]];
255
        s += sq[pix1[6] - pix2[6]];
256
        s += sq[pix1[7] - pix2[7]];
257
        pix1 += line_size;
258
        pix2 += line_size;
259
    }
260
    return s;
261
}
262

    
263
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264
{
265
    int s, i;
266
    uint32_t *sq = squareTbl + 256;
267

    
268
    s = 0;
269
    for (i = 0; i < h; i++) {
270
        s += sq[pix1[ 0] - pix2[ 0]];
271
        s += sq[pix1[ 1] - pix2[ 1]];
272
        s += sq[pix1[ 2] - pix2[ 2]];
273
        s += sq[pix1[ 3] - pix2[ 3]];
274
        s += sq[pix1[ 4] - pix2[ 4]];
275
        s += sq[pix1[ 5] - pix2[ 5]];
276
        s += sq[pix1[ 6] - pix2[ 6]];
277
        s += sq[pix1[ 7] - pix2[ 7]];
278
        s += sq[pix1[ 8] - pix2[ 8]];
279
        s += sq[pix1[ 9] - pix2[ 9]];
280
        s += sq[pix1[10] - pix2[10]];
281
        s += sq[pix1[11] - pix2[11]];
282
        s += sq[pix1[12] - pix2[12]];
283
        s += sq[pix1[13] - pix2[13]];
284
        s += sq[pix1[14] - pix2[14]];
285
        s += sq[pix1[15] - pix2[15]];
286

    
287
        pix1 += line_size;
288
        pix2 += line_size;
289
    }
290
    return s;
291
}
292

    
293

    
294
/**
 * Wavelet-domain comparison of two pixel blocks.
 *
 * The pixel differences, scaled by 16, are written to a 16x16 work
 * buffer, transformed in place with ff_spatial_dwt(), and the absolute
 * values of the resulting coefficients are summed.
 *
 * When CONFIG_SNOW_ENCODER is not defined, ff_spatial_dwt() is not
 * available and the function returns 0 instead of falling off the end
 * of a non-void function.
 *
 * An experimental per-subband weighting of the coefficients used to be
 * kept here under "#if 0"; it was never compiled and has been removed.
 *
 * @param v         unused here
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; the callers in this file pass 8 or 16.
 *                  It must be a multiple of 4 and at most 16, because
 *                  the work buffer is 16 columns wide.
 * @param h         block height; at most 16 (work buffer has 16 rows)
 * @param type      transform selector passed through to ff_spatial_dwt()
 * @return sum of absolute transform coefficients divided by 4, or 0
 *         when the snow encoder is not compiled in
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* multiply instead of "<<4": the difference may be negative,
               and left-shifting a negative value is undefined */
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
#else
    /* no wavelet transform available: return a defined value */
    return 0;
#endif
}
376

    
377
/**
 * 8-pixel-wide wavelet comparison: forwards to w_c() with w = 8 and
 * type = 1 (going by the function name, the 5/3 transform -- confirm
 * against ff_spatial_dwt()).
 */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int block_width = 8;
    const int dwt_type    = 1;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
380

    
381
/**
 * 8-pixel-wide wavelet comparison: forwards to w_c() with w = 8 and
 * type = 0 (going by the function name, the 9/7 transform -- confirm
 * against ff_spatial_dwt()).
 */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int block_width = 8;
    const int dwt_type    = 0;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
384

    
385
/**
 * 16-pixel-wide wavelet comparison: forwards to w_c() with w = 16 and
 * type = 1 (going by the function name, the 5/3 transform -- confirm
 * against ff_spatial_dwt()).
 */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int block_width = 16;
    const int dwt_type    = 1;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
388

    
389
/**
 * 16-pixel-wide wavelet comparison: forwards to w_c() with w = 16 and
 * type = 0 (going by the function name, the 9/7 transform -- confirm
 * against ff_spatial_dwt()).
 */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int block_width = 16;
    const int dwt_type    = 0;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
392

    
393
/**
 * Copy an 8x8 block of pixels into a coefficient block.
 *
 * @param block     destination; 64 consecutive elements, 8 per row
 * @param pixels    top-left pixel of the source block
 * @param line_size source row stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
411

    
412
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks into a
 * coefficient block.
 *
 * @param block  destination; 64 consecutive elements, 8 per row
 * @param s1     top-left pixel of the minuend block
 * @param s2     top-left pixel of the subtrahend block
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
431

    
432

    
433
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434
                                 int line_size)
435
{
436
    int i;
437
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
438

    
439
    /* read the pixels */
440
    for(i=0;i<8;i++) {
441
        pixels[0] = cm[block[0]];
442
        pixels[1] = cm[block[1]];
443
        pixels[2] = cm[block[2]];
444
        pixels[3] = cm[block[3]];
445
        pixels[4] = cm[block[4]];
446
        pixels[5] = cm[block[5]];
447
        pixels[6] = cm[block[6]];
448
        pixels[7] = cm[block[7]];
449

    
450
        pixels += line_size;
451
        block += 8;
452
    }
453
}
454

    
455
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456
                                 int line_size)
457
{
458
    int i;
459
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
460

    
461
    /* read the pixels */
462
    for(i=0;i<4;i++) {
463
        pixels[0] = cm[block[0]];
464
        pixels[1] = cm[block[1]];
465
        pixels[2] = cm[block[2]];
466
        pixels[3] = cm[block[3]];
467

    
468
        pixels += line_size;
469
        block += 8;
470
    }
471
}
472

    
473
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474
                                 int line_size)
475
{
476
    int i;
477
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
478

    
479
    /* read the pixels */
480
    for(i=0;i<2;i++) {
481
        pixels[0] = cm[block[0]];
482
        pixels[1] = cm[block[1]];
483

    
484
        pixels += line_size;
485
        block += 8;
486
    }
487
}
488

    
489
/**
 * Write an 8x8 block of signed coefficients to pixels with a +128
 * bias, saturating the result to 0..255.
 *
 * Values below -128 become 0, values above 127 become 255, and
 * everything in between is stored as value + 128.
 *
 * @param block     source; 64 consecutive elements, 8 per row
 * @param pixels    top-left pixel of the destination block
 * @param line_size destination row stride in bytes
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const DCTELEM coef = block[col];

            if (coef < -128)
                pixels[col] = 0;
            else if (coef > 127)
                pixels[col] = 255;
            else
                pixels[col] = (uint8_t)(coef + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
509

    
510
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511
                          int line_size)
512
{
513
    int i;
514
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
515

    
516
    /* read the pixels */
517
    for(i=0;i<8;i++) {
518
        pixels[0] = cm[pixels[0] + block[0]];
519
        pixels[1] = cm[pixels[1] + block[1]];
520
        pixels[2] = cm[pixels[2] + block[2]];
521
        pixels[3] = cm[pixels[3] + block[3]];
522
        pixels[4] = cm[pixels[4] + block[4]];
523
        pixels[5] = cm[pixels[5] + block[5]];
524
        pixels[6] = cm[pixels[6] + block[6]];
525
        pixels[7] = cm[pixels[7] + block[7]];
526
        pixels += line_size;
527
        block += 8;
528
    }
529
}
530

    
531
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532
                          int line_size)
533
{
534
    int i;
535
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
536

    
537
    /* read the pixels */
538
    for(i=0;i<4;i++) {
539
        pixels[0] = cm[pixels[0] + block[0]];
540
        pixels[1] = cm[pixels[1] + block[1]];
541
        pixels[2] = cm[pixels[2] + block[2]];
542
        pixels[3] = cm[pixels[3] + block[3]];
543
        pixels += line_size;
544
        block += 8;
545
    }
546
}
547

    
548
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549
                          int line_size)
550
{
551
    int i;
552
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
553

    
554
    /* read the pixels */
555
    for(i=0;i<2;i++) {
556
        pixels[0] = cm[pixels[0] + block[0]];
557
        pixels[1] = cm[pixels[1] + block[1]];
558
        pixels += line_size;
559
        block += 8;
560
    }
561
}
562

    
563
/**
 * Add an 8x8 coefficient block to the pixels in place, without
 * clamping: each sum is stored back into a uint8_t and so wraps
 * modulo 256.
 *
 * @param pixels    top-left pixel of the block, updated in place
 * @param block     coefficients to add; 64 consecutive elements, 8 per row
 * @param line_size pixel row stride in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
579

    
580
/**
 * Add a 4x4 coefficient block to the pixels in place, without
 * clamping: each sum is stored back into a uint8_t and so wraps
 * modulo 256.
 *
 * Unlike the clamped 4x4 helpers, the coefficient rows here are
 * packed 4 elements apart.
 *
 * @param pixels    top-left pixel of the block, updated in place
 * @param block     coefficients to add; 16 consecutive elements, 4 per row
 * @param line_size pixel row stride in bytes
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
592

    
593
#if 0
594

595
#define PIXOP2(OPNAME, OP) \
596
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
597
{\
598
    int i;\
599
    for(i=0; i<h; i++){\
600
        OP(*((uint64_t*)block), LD64(pixels));\
601
        pixels+=line_size;\
602
        block +=line_size;\
603
    }\
604
}\
605
\
606
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
607
{\
608
    int i;\
609
    for(i=0; i<h; i++){\
610
        const uint64_t a= LD64(pixels  );\
611
        const uint64_t b= LD64(pixels+1);\
612
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
613
        pixels+=line_size;\
614
        block +=line_size;\
615
    }\
616
}\
617
\
618
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
619
{\
620
    int i;\
621
    for(i=0; i<h; i++){\
622
        const uint64_t a= LD64(pixels  );\
623
        const uint64_t b= LD64(pixels+1);\
624
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
625
        pixels+=line_size;\
626
        block +=line_size;\
627
    }\
628
}\
629
\
630
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
631
{\
632
    int i;\
633
    for(i=0; i<h; i++){\
634
        const uint64_t a= LD64(pixels          );\
635
        const uint64_t b= LD64(pixels+line_size);\
636
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
637
        pixels+=line_size;\
638
        block +=line_size;\
639
    }\
640
}\
641
\
642
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
643
{\
644
    int i;\
645
    for(i=0; i<h; i++){\
646
        const uint64_t a= LD64(pixels          );\
647
        const uint64_t b= LD64(pixels+line_size);\
648
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
649
        pixels+=line_size;\
650
        block +=line_size;\
651
    }\
652
}\
653
\
654
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
655
{\
656
        int i;\
657
        const uint64_t a= LD64(pixels  );\
658
        const uint64_t b= LD64(pixels+1);\
659
        uint64_t l0=  (a&0x0303030303030303ULL)\
660
                    + (b&0x0303030303030303ULL)\
661
                    + 0x0202020202020202ULL;\
662
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
663
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
664
        uint64_t l1,h1;\
665
\
666
        pixels+=line_size;\
667
        for(i=0; i<h; i+=2){\
668
            uint64_t a= LD64(pixels  );\
669
            uint64_t b= LD64(pixels+1);\
670
            l1=  (a&0x0303030303030303ULL)\
671
               + (b&0x0303030303030303ULL);\
672
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
674
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
675
            pixels+=line_size;\
676
            block +=line_size;\
677
            a= LD64(pixels  );\
678
            b= LD64(pixels+1);\
679
            l0=  (a&0x0303030303030303ULL)\
680
               + (b&0x0303030303030303ULL)\
681
               + 0x0202020202020202ULL;\
682
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
685
            pixels+=line_size;\
686
            block +=line_size;\
687
        }\
688
}\
689
\
690
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
691
{\
692
        int i;\
693
        const uint64_t a= LD64(pixels  );\
694
        const uint64_t b= LD64(pixels+1);\
695
        uint64_t l0=  (a&0x0303030303030303ULL)\
696
                    + (b&0x0303030303030303ULL)\
697
                    + 0x0101010101010101ULL;\
698
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
699
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
700
        uint64_t l1,h1;\
701
\
702
        pixels+=line_size;\
703
        for(i=0; i<h; i+=2){\
704
            uint64_t a= LD64(pixels  );\
705
            uint64_t b= LD64(pixels+1);\
706
            l1=  (a&0x0303030303030303ULL)\
707
               + (b&0x0303030303030303ULL);\
708
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
710
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
711
            pixels+=line_size;\
712
            block +=line_size;\
713
            a= LD64(pixels  );\
714
            b= LD64(pixels+1);\
715
            l0=  (a&0x0303030303030303ULL)\
716
               + (b&0x0303030303030303ULL)\
717
               + 0x0101010101010101ULL;\
718
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
721
            pixels+=line_size;\
722
            block +=line_size;\
723
        }\
724
}\
725
\
726
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
727
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
728
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
729
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
730
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
731
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
732
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
733

734
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735
#else // 64 bit variant
736

    
737
#define PIXOP2(OPNAME, OP) \
738
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
739
    int i;\
740
    for(i=0; i<h; i++){\
741
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
742
        pixels+=line_size;\
743
        block +=line_size;\
744
    }\
745
}\
746
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
747
    int i;\
748
    for(i=0; i<h; i++){\
749
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
750
        pixels+=line_size;\
751
        block +=line_size;\
752
    }\
753
}\
754
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
755
    int i;\
756
    for(i=0; i<h; i++){\
757
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
758
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
759
        pixels+=line_size;\
760
        block +=line_size;\
761
    }\
762
}\
763
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
765
}\
766
\
767
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768
                                                int src_stride1, int src_stride2, int h){\
769
    int i;\
770
    for(i=0; i<h; i++){\
771
        uint32_t a,b;\
772
        a= LD32(&src1[i*src_stride1  ]);\
773
        b= LD32(&src2[i*src_stride2  ]);\
774
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
775
        a= LD32(&src1[i*src_stride1+4]);\
776
        b= LD32(&src2[i*src_stride2+4]);\
777
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
778
    }\
779
}\
780
\
781
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782
                                                int src_stride1, int src_stride2, int h){\
783
    int i;\
784
    for(i=0; i<h; i++){\
785
        uint32_t a,b;\
786
        a= LD32(&src1[i*src_stride1  ]);\
787
        b= LD32(&src2[i*src_stride2  ]);\
788
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
789
        a= LD32(&src1[i*src_stride1+4]);\
790
        b= LD32(&src2[i*src_stride2+4]);\
791
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
792
    }\
793
}\
794
\
795
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796
                                                int src_stride1, int src_stride2, int h){\
797
    int i;\
798
    for(i=0; i<h; i++){\
799
        uint32_t a,b;\
800
        a= LD32(&src1[i*src_stride1  ]);\
801
        b= LD32(&src2[i*src_stride2  ]);\
802
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
803
    }\
804
}\
805
\
806
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807
                                                int src_stride1, int src_stride2, int h){\
808
    int i;\
809
    for(i=0; i<h; i++){\
810
        uint32_t a,b;\
811
        a= LD16(&src1[i*src_stride1  ]);\
812
        b= LD16(&src2[i*src_stride2  ]);\
813
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
814
    }\
815
}\
816
\
817
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818
                                                int src_stride1, int src_stride2, int h){\
819
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
820
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
821
}\
822
\
823
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824
                                                int src_stride1, int src_stride2, int h){\
825
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
826
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
827
}\
828
\
829
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
831
}\
832
\
833
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
834
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
835
}\
836
\
837
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
839
}\
840
\
841
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
843
}\
844
\
845
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
847
    int i;\
848
    for(i=0; i<h; i++){\
849
        uint32_t a, b, c, d, l0, l1, h0, h1;\
850
        a= LD32(&src1[i*src_stride1]);\
851
        b= LD32(&src2[i*src_stride2]);\
852
        c= LD32(&src3[i*src_stride3]);\
853
        d= LD32(&src4[i*src_stride4]);\
854
        l0=  (a&0x03030303UL)\
855
           + (b&0x03030303UL)\
856
           + 0x02020202UL;\
857
        h0= ((a&0xFCFCFCFCUL)>>2)\
858
          + ((b&0xFCFCFCFCUL)>>2);\
859
        l1=  (c&0x03030303UL)\
860
           + (d&0x03030303UL);\
861
        h1= ((c&0xFCFCFCFCUL)>>2)\
862
          + ((d&0xFCFCFCFCUL)>>2);\
863
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864
        a= LD32(&src1[i*src_stride1+4]);\
865
        b= LD32(&src2[i*src_stride2+4]);\
866
        c= LD32(&src3[i*src_stride3+4]);\
867
        d= LD32(&src4[i*src_stride4+4]);\
868
        l0=  (a&0x03030303UL)\
869
           + (b&0x03030303UL)\
870
           + 0x02020202UL;\
871
        h0= ((a&0xFCFCFCFCUL)>>2)\
872
          + ((b&0xFCFCFCFCUL)>>2);\
873
        l1=  (c&0x03030303UL)\
874
           + (d&0x03030303UL);\
875
        h1= ((c&0xFCFCFCFCUL)>>2)\
876
          + ((d&0xFCFCFCFCUL)>>2);\
877
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878
    }\
879
}\
880
\
881
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
883
}\
884
\
885
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
887
}\
888
\
889
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
891
}\
892
\
893
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
895
}\
896
\
897
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
899
    int i;\
900
    for(i=0; i<h; i++){\
901
        uint32_t a, b, c, d, l0, l1, h0, h1;\
902
        a= LD32(&src1[i*src_stride1]);\
903
        b= LD32(&src2[i*src_stride2]);\
904
        c= LD32(&src3[i*src_stride3]);\
905
        d= LD32(&src4[i*src_stride4]);\
906
        l0=  (a&0x03030303UL)\
907
           + (b&0x03030303UL)\
908
           + 0x01010101UL;\
909
        h0= ((a&0xFCFCFCFCUL)>>2)\
910
          + ((b&0xFCFCFCFCUL)>>2);\
911
        l1=  (c&0x03030303UL)\
912
           + (d&0x03030303UL);\
913
        h1= ((c&0xFCFCFCFCUL)>>2)\
914
          + ((d&0xFCFCFCFCUL)>>2);\
915
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916
        a= LD32(&src1[i*src_stride1+4]);\
917
        b= LD32(&src2[i*src_stride2+4]);\
918
        c= LD32(&src3[i*src_stride3+4]);\
919
        d= LD32(&src4[i*src_stride4+4]);\
920
        l0=  (a&0x03030303UL)\
921
           + (b&0x03030303UL)\
922
           + 0x01010101UL;\
923
        h0= ((a&0xFCFCFCFCUL)>>2)\
924
          + ((b&0xFCFCFCFCUL)>>2);\
925
        l1=  (c&0x03030303UL)\
926
           + (d&0x03030303UL);\
927
        h1= ((c&0xFCFCFCFCUL)>>2)\
928
          + ((d&0xFCFCFCFCUL)>>2);\
929
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930
    }\
931
}\
932
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936
}\
937
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941
}\
942
\
943
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
944
{\
945
        int i, a0, b0, a1, b1;\
946
        a0= pixels[0];\
947
        b0= pixels[1] + 2;\
948
        a0 += b0;\
949
        b0 += pixels[2];\
950
\
951
        pixels+=line_size;\
952
        for(i=0; i<h; i+=2){\
953
            a1= pixels[0];\
954
            b1= pixels[1];\
955
            a1 += b1;\
956
            b1 += pixels[2];\
957
\
958
            block[0]= (a1+a0)>>2; /* FIXME non put */\
959
            block[1]= (b1+b0)>>2;\
960
\
961
            pixels+=line_size;\
962
            block +=line_size;\
963
\
964
            a0= pixels[0];\
965
            b0= pixels[1] + 2;\
966
            a0 += b0;\
967
            b0 += pixels[2];\
968
\
969
            block[0]= (a1+a0)>>2;\
970
            block[1]= (b1+b0)>>2;\
971
            pixels+=line_size;\
972
            block +=line_size;\
973
        }\
974
}\
975
\
976
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
977
{\
978
        int i;\
979
        const uint32_t a= LD32(pixels  );\
980
        const uint32_t b= LD32(pixels+1);\
981
        uint32_t l0=  (a&0x03030303UL)\
982
                    + (b&0x03030303UL)\
983
                    + 0x02020202UL;\
984
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985
                   + ((b&0xFCFCFCFCUL)>>2);\
986
        uint32_t l1,h1;\
987
\
988
        pixels+=line_size;\
989
        for(i=0; i<h; i+=2){\
990
            uint32_t a= LD32(pixels  );\
991
            uint32_t b= LD32(pixels+1);\
992
            l1=  (a&0x03030303UL)\
993
               + (b&0x03030303UL);\
994
            h1= ((a&0xFCFCFCFCUL)>>2)\
995
              + ((b&0xFCFCFCFCUL)>>2);\
996
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
997
            pixels+=line_size;\
998
            block +=line_size;\
999
            a= LD32(pixels  );\
1000
            b= LD32(pixels+1);\
1001
            l0=  (a&0x03030303UL)\
1002
               + (b&0x03030303UL)\
1003
               + 0x02020202UL;\
1004
            h0= ((a&0xFCFCFCFCUL)>>2)\
1005
              + ((b&0xFCFCFCFCUL)>>2);\
1006
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007
            pixels+=line_size;\
1008
            block +=line_size;\
1009
        }\
1010
}\
1011
\
1012
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1013
{\
1014
    int j;\
1015
    for(j=0; j<2; j++){\
1016
        int i;\
1017
        const uint32_t a= LD32(pixels  );\
1018
        const uint32_t b= LD32(pixels+1);\
1019
        uint32_t l0=  (a&0x03030303UL)\
1020
                    + (b&0x03030303UL)\
1021
                    + 0x02020202UL;\
1022
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023
                   + ((b&0xFCFCFCFCUL)>>2);\
1024
        uint32_t l1,h1;\
1025
\
1026
        pixels+=line_size;\
1027
        for(i=0; i<h; i+=2){\
1028
            uint32_t a= LD32(pixels  );\
1029
            uint32_t b= LD32(pixels+1);\
1030
            l1=  (a&0x03030303UL)\
1031
               + (b&0x03030303UL);\
1032
            h1= ((a&0xFCFCFCFCUL)>>2)\
1033
              + ((b&0xFCFCFCFCUL)>>2);\
1034
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035
            pixels+=line_size;\
1036
            block +=line_size;\
1037
            a= LD32(pixels  );\
1038
            b= LD32(pixels+1);\
1039
            l0=  (a&0x03030303UL)\
1040
               + (b&0x03030303UL)\
1041
               + 0x02020202UL;\
1042
            h0= ((a&0xFCFCFCFCUL)>>2)\
1043
              + ((b&0xFCFCFCFCUL)>>2);\
1044
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045
            pixels+=line_size;\
1046
            block +=line_size;\
1047
        }\
1048
        pixels+=4-line_size*(h+1);\
1049
        block +=4-line_size*h;\
1050
    }\
1051
}\
1052
\
1053
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1054
{\
1055
    int j;\
1056
    for(j=0; j<2; j++){\
1057
        int i;\
1058
        const uint32_t a= LD32(pixels  );\
1059
        const uint32_t b= LD32(pixels+1);\
1060
        uint32_t l0=  (a&0x03030303UL)\
1061
                    + (b&0x03030303UL)\
1062
                    + 0x01010101UL;\
1063
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064
                   + ((b&0xFCFCFCFCUL)>>2);\
1065
        uint32_t l1,h1;\
1066
\
1067
        pixels+=line_size;\
1068
        for(i=0; i<h; i+=2){\
1069
            uint32_t a= LD32(pixels  );\
1070
            uint32_t b= LD32(pixels+1);\
1071
            l1=  (a&0x03030303UL)\
1072
               + (b&0x03030303UL);\
1073
            h1= ((a&0xFCFCFCFCUL)>>2)\
1074
              + ((b&0xFCFCFCFCUL)>>2);\
1075
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076
            pixels+=line_size;\
1077
            block +=line_size;\
1078
            a= LD32(pixels  );\
1079
            b= LD32(pixels+1);\
1080
            l0=  (a&0x03030303UL)\
1081
               + (b&0x03030303UL)\
1082
               + 0x01010101UL;\
1083
            h0= ((a&0xFCFCFCFCUL)>>2)\
1084
              + ((b&0xFCFCFCFCUL)>>2);\
1085
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086
            pixels+=line_size;\
1087
            block +=line_size;\
1088
        }\
1089
        pixels+=4-line_size*(h+1);\
1090
        block +=4-line_size*h;\
1091
    }\
1092
}\
1093
\
1094
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1095
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1099
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1102

    
1103
#define op_avg(a, b) a = rnd_avg32(a, b)
1104
#endif
1105
#define op_put(a, b) a = b
1106

    
1107
PIXOP2(avg, op_avg)
1108
PIXOP2(put, op_put)
1109
#undef op_avg
1110
#undef op_put
1111

    
1112
/* Mean of two / four values, rounding halves up.  Every argument is
   parenthesised so that operands containing operators of lower precedence
   than '+' (|, &, ?:, ...) are still summed as a whole.  Arguments are
   evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1114

    
1115
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * The same stride is used for dst and for both sources a and b.
 * NOTE(review): presumably stores the non-rounding average of a and b;
 * the helper is generated by a macro defined elsewhere - confirm there.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1118

    
1119
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * The same stride is used for dst and for both sources a and b.
 * NOTE(review): presumably stores the non-rounding average of a and b;
 * the helper is generated by a macro defined elsewhere - confirm there.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1122

    
1123
/**
 * Blend each output pixel from its 2x2 source neighbourhood using one
 * fractional offset for the whole block.
 *
 * Writes 8 pixels per row for h rows.  Each row reads src[0..8] of the
 * current line and of the line below it.
 *
 * @param dst     destination, advanced by stride per row
 * @param src     source, advanced by stride per row
 * @param stride  line size shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder value added before the final >>8
 *
 * The four weights always sum to 256, hence the >>8 normalisation.
 * NOTE(review): x16/y16 are presumably in 0..16 - not checked here.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16); /* top-left     */
    const int B=(   x16)*(16-y16); /* top-right    */
    const int C=(16-x16)*(   y16); /* bottom-left  */
    const int D=(   x16)*(   y16); /* bottom-right */
    int i;

    for(i=0; i<h; i++)
    {
        /* kept unrolled: fixed width of 8 pixels */
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1145

    
1146
/**
 * Per-pixel warped fetch with edge clamping: every output pixel gets its
 * own source position, stepped linearly across the block.
 *
 * Writes 8 pixels per row for h rows into dst.
 *
 * @param dst            destination block
 * @param src            source picture
 * @param stride         line size shared by dst and src
 * @param h              number of rows to produce
 * @param ox, oy         source position of the first pixel of the first row
 * @param dxx, dyx       position increment per output column
 * @param dxy, dyy       position increment per output row
 * @param shift          number of fractional bits left after the >>16
 * @param r              value added before the final >>(2*shift)
 * @param width, height  source dimensions used for the bounds checks
 *
 * Positions carry 16 extra low bits that are dropped first; of the rest,
 * the low `shift` bits are the interpolation fraction and the remainder
 * is the integer sample coordinate.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* From here on width/height are the last column/row index.  A sample
       on that last column/row fails the "<" tests below, so the code never
       reads a neighbour beyond it. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ /* XXX FIXME optimize */
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned casts fold the "< 0" test into the upper-bound test */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: blend all four neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row out of range: clamp it, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column out of range: clamp it, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: copy the clamped sample unchanged */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1203

    
1204
/**
 * Zero-offset case: hand the block to the put_pixelsN_c() routine that
 * matches width.  Widths other than 2, 4, 8 and 16 write nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing to do */
    }
}
1212

    
1213
/**
 * Horizontal blend, weights 2:1 for (current, right neighbour).
 * 683/2048 approximates 1/3, so each output is (2*a + b + 1) / 3.
 * Reads one pixel to the right of the width x height block.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1223

    
1224
/**
 * Horizontal blend, weights 1:2 for (current, right neighbour).
 * 683/2048 approximates 1/3, so each output is (a + 2*b + 1) / 3.
 * Reads one pixel to the right of the width x height block.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1234

    
1235
/**
 * Vertical blend, weights 2:1 for (current, pixel below).
 * 683/2048 approximates 1/3, so each output is (2*a + c + 1) / 3.
 * Reads one row below the width x height block.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1245

    
1246
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (sum 12).  2731/32768 approximates 1/12; the +6 rounds to nearest.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1256

    
1257
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (sum 12).  2731/32768 approximates 1/12; the +6 rounds to nearest.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1267

    
1268
/**
 * Vertical blend, weights 1:2 for (current, pixel below).
 * 683/2048 approximates 1/3, so each output is (a + 2*c + 1) / 3.
 * Reads one row below the width x height block.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1278

    
1279
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (sum 12).  2731/32768 approximates 1/12; the +6 rounds to nearest.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1289

    
1290
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (sum 12).  2731/32768 approximates 1/12; the +6 rounds to nearest.
 * Reads one column to the right of and one row below the block.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1300

    
1301
/**
 * Zero-offset case of the averaging family: hand the block to the
 * avg_pixelsN_c() routine that matches width.  Widths other than 2, 4, 8
 * and 16 leave dst untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing to do */
    }
}
1309

    
1310
/**
 * Horizontal 2:1 blend (current, right neighbour), then averaged with the
 * value already in dst, rounding halves up.
 * 683/2048 approximates 1/3.  Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1320

    
1321
/**
 * Horizontal 1:2 blend (current, right neighbour), then averaged with the
 * value already in dst, rounding halves up.
 * 683/2048 approximates 1/3.  Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1331

    
1332
/**
 * Vertical 2:1 blend (current, pixel below), then averaged with the value
 * already in dst, rounding halves up.
 * 683/2048 approximates 1/3.  Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1342

    
1343
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (2731/32768 approximates 1/12), then averaged with the value already in
 * dst, rounding halves up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1353

    
1354
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (2731/32768 approximates 1/12), then averaged with the value already in
 * dst, rounding halves up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1364

    
1365
/**
 * Vertical 1:2 blend (current, pixel below), then averaged with the value
 * already in dst, rounding halves up.
 * 683/2048 approximates 1/3.  Reads one row below the block.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1375

    
1376
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (2731/32768 approximates 1/12), then averaged with the value already in
 * dst, rounding halves up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1386

    
1387
/**
 * Diagonal blend of the 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (2731/32768 approximates 1/12), then averaged with the value already in
 * dst, rounding halves up.
 * Reads one column to the right of and one row below the block.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1397
#if 0
/* Disabled.  TPEL_WIDTH(w) would emit fixed-width wrappers
   put_tpel_pixels<w>_mcXY_c(dst, src, stride, height) that forward to the
   generic put_tpel_pixels_mcXY_c() with width = w.  The bodies are plain
   calls, so the block is valid C if it is ever re-enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1418

    
1419
/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c(dst, src, stride, h, x, y).
 *
 * Each function produces a block 2, 4 or 8 pixels wide and h rows high.
 * Every output pixel is the weighted sum of its 2x2 source neighbourhood
 * with weights A (top-left), B (top-right), C (bottom-left) and
 * D (bottom-right).  The weights always sum to 64; the unnormalised sum
 * is passed to OP, which is responsible for scaling and storing it.
 * Each row reads one pixel beyond the block width, on the current source
 * line and on the line below it.
 *
 * x and y must be in 0..7 (asserted).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* b is the weighted sum scaled by 64: add 32 and shift by 6 to round to
   nearest.  op_put stores that value; op_avg additionally averages it with
   the value already in a, rounding halves up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1489

    
1490
/**
 * Copy a block 2 bytes wide and h rows high.
 * Each row is moved as one 16-bit unit through LD16/ST16; dst and src
 * advance by their own strides.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1500

    
1501
/**
 * Copy a block 4 bytes wide and h rows high.
 * Each row is moved as one 32-bit unit through LD32/ST32; dst and src
 * advance by their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1511

    
1512
/**
 * Copy a block 8 bytes wide and h rows high.
 * Each row is moved as two 32-bit units through LD32/ST32; dst and src
 * advance by their own strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1523

    
1524
/**
 * Copy a block 16 bytes wide and h rows high.
 * Each row is moved as four 32-bit units through LD32/ST32; dst and src
 * advance by their own strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1537

    
1538
/**
 * Copy a block 17 bytes wide and h rows high.
 * Each row is moved as four 32-bit units through LD32/ST32 plus one
 * trailing byte; dst and src advance by their own strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16]; /* 17th column */
        dst+=dstStride;
        src+=srcStride;
    }
}
1552

    
1553
/**
 * Copy a block 9 bytes wide and h rows high.
 * Each row is moved as two 32-bit units through LD32/ST32 plus one
 * trailing byte; dst and src advance by their own strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8]; /* 9th column */
        dst+=dstStride;
        src+=srcStride;
    }
}
1565

    
1566

    
1567
#define QPEL_MC(r, OPNAME, RND, OP) \
1568
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1569
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1570
    int i;\
1571
    for(i=0; i<h; i++)\
1572
    {\
1573
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1574
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1575
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1576
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1577
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1578
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1579
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1580
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1581
        dst+=dstStride;\
1582
        src+=srcStride;\
1583
    }\
1584
}\
1585
\
1586
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1587
    const int w=8;\
1588
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1589
    int i;\
1590
    for(i=0; i<w; i++)\
1591
    {\
1592
        const int src0= src[0*srcStride];\
1593
        const int src1= src[1*srcStride];\
1594
        const int src2= src[2*srcStride];\
1595
        const int src3= src[3*srcStride];\
1596
        const int src4= src[4*srcStride];\
1597
        const int src5= src[5*srcStride];\
1598
        const int src6= src[6*srcStride];\
1599
        const int src7= src[7*srcStride];\
1600
        const int src8= src[8*srcStride];\
1601
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1602
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1603
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1604
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1605
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1606
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1607
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1608
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1609
        dst++;\
1610
        src++;\
1611
    }\
1612
}\
1613
\
1614
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1615
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1616
    int i;\
1617
    \
1618
    for(i=0; i<h; i++)\
1619
    {\
1620
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1621
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1622
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1623
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1624
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1625
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1626
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1627
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1628
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1629
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1630
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1631
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1632
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1633
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1634
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1635
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1636
        dst+=dstStride;\
1637
        src+=srcStride;\
1638
    }\
1639
}\
1640
\
1641
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1643
    int i;\
1644
    const int w=16;\
1645
    for(i=0; i<w; i++)\
1646
    {\
1647
        const int src0= src[0*srcStride];\
1648
        const int src1= src[1*srcStride];\
1649
        const int src2= src[2*srcStride];\
1650
        const int src3= src[3*srcStride];\
1651
        const int src4= src[4*srcStride];\
1652
        const int src5= src[5*srcStride];\
1653
        const int src6= src[6*srcStride];\
1654
        const int src7= src[7*srcStride];\
1655
        const int src8= src[8*srcStride];\
1656
        const int src9= src[9*srcStride];\
1657
        const int src10= src[10*srcStride];\
1658
        const int src11= src[11*srcStride];\
1659
        const int src12= src[12*srcStride];\
1660
        const int src13= src[13*srcStride];\
1661
        const int src14= src[14*srcStride];\
1662
        const int src15= src[15*srcStride];\
1663
        const int src16= src[16*srcStride];\
1664
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1665
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1666
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1667
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1668
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1669
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1670
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1671
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1672
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1673
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1674
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1675
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1676
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1677
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1678
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1679
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1680
        dst++;\
1681
        src++;\
1682
    }\
1683
}\
1684
\
1685
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1686
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1687
}\
1688
\
1689
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1690
    uint8_t half[64];\
1691
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1692
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1693
}\
1694
\
1695
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1696
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1697
}\
1698
\
1699
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1700
    uint8_t half[64];\
1701
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1702
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1703
}\
1704
\
1705
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1706
    uint8_t full[16*9];\
1707
    uint8_t half[64];\
1708
    copy_block9(full, src, 16, stride, 9);\
1709
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1710
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1711
}\
1712
\
1713
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1714
    uint8_t full[16*9];\
1715
    copy_block9(full, src, 16, stride, 9);\
1716
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1717
}\
1718
\
1719
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1720
    uint8_t full[16*9];\
1721
    uint8_t half[64];\
1722
    copy_block9(full, src, 16, stride, 9);\
1723
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1724
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1725
}\
1726
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727
    uint8_t full[16*9];\
1728
    uint8_t halfH[72];\
1729
    uint8_t halfV[64];\
1730
    uint8_t halfHV[64];\
1731
    copy_block9(full, src, 16, stride, 9);\
1732
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736
}\
1737
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1738
    uint8_t full[16*9];\
1739
    uint8_t halfH[72];\
1740
    uint8_t halfHV[64];\
1741
    copy_block9(full, src, 16, stride, 9);\
1742
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1746
}\
1747
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[16*9];\
1749
    uint8_t halfH[72];\
1750
    uint8_t halfV[64];\
1751
    uint8_t halfHV[64];\
1752
    copy_block9(full, src, 16, stride, 9);\
1753
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1754
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757
}\
1758
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1759
    uint8_t full[16*9];\
1760
    uint8_t halfH[72];\
1761
    uint8_t halfHV[64];\
1762
    copy_block9(full, src, 16, stride, 9);\
1763
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1767
}\
1768
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t full[16*9];\
1770
    uint8_t halfH[72];\
1771
    uint8_t halfV[64];\
1772
    uint8_t halfHV[64];\
1773
    copy_block9(full, src, 16, stride, 9);\
1774
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1776
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1778
}\
1779
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1780
    uint8_t full[16*9];\
1781
    uint8_t halfH[72];\
1782
    uint8_t halfHV[64];\
1783
    copy_block9(full, src, 16, stride, 9);\
1784
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1785
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1786
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1788
}\
1789
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1790
    uint8_t full[16*9];\
1791
    uint8_t halfH[72];\
1792
    uint8_t halfV[64];\
1793
    uint8_t halfHV[64];\
1794
    copy_block9(full, src, 16, stride, 9);\
1795
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1796
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1797
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1799
}\
1800
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1801
    uint8_t full[16*9];\
1802
    uint8_t halfH[72];\
1803
    uint8_t halfHV[64];\
1804
    copy_block9(full, src, 16, stride, 9);\
1805
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1806
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1807
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1808
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1809
}\
1810
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1811
    uint8_t halfH[72];\
1812
    uint8_t halfHV[64];\
1813
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1816
}\
1817
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1818
    uint8_t halfH[72];\
1819
    uint8_t halfHV[64];\
1820
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1821
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1823
}\
1824
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1825
    uint8_t full[16*9];\
1826
    uint8_t halfH[72];\
1827
    uint8_t halfV[64];\
1828
    uint8_t halfHV[64];\
1829
    copy_block9(full, src, 16, stride, 9);\
1830
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1831
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1832
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1834
}\
1835
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1836
    uint8_t full[16*9];\
1837
    uint8_t halfH[72];\
1838
    copy_block9(full, src, 16, stride, 9);\
1839
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1840
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1841
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1842
}\
1843
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844
    uint8_t full[16*9];\
1845
    uint8_t halfH[72];\
1846
    uint8_t halfV[64];\
1847
    uint8_t halfHV[64];\
1848
    copy_block9(full, src, 16, stride, 9);\
1849
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1851
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1853
}\
1854
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1855
    uint8_t full[16*9];\
1856
    uint8_t halfH[72];\
1857
    copy_block9(full, src, 16, stride, 9);\
1858
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1859
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1860
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1861
}\
1862
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1863
    uint8_t halfH[72];\
1864
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1865
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1866
}\
1867
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1868
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1869
}\
1870
\
1871
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1872
    uint8_t half[256];\
1873
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1874
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1875
}\
1876
\
1877
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1878
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1879
}\
1880
\
1881
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1882
    uint8_t half[256];\
1883
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1884
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1885
}\
1886
\
1887
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1888
    uint8_t full[24*17];\
1889
    uint8_t half[256];\
1890
    copy_block17(full, src, 24, stride, 17);\
1891
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1892
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1893
}\
1894
\
1895
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1896
    uint8_t full[24*17];\
1897
    copy_block17(full, src, 24, stride, 17);\
1898
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1899
}\
1900
\
1901
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1902
    uint8_t full[24*17];\
1903
    uint8_t half[256];\
1904
    copy_block17(full, src, 24, stride, 17);\
1905
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1906
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1907
}\
1908
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909
    uint8_t full[24*17];\
1910
    uint8_t halfH[272];\
1911
    uint8_t halfV[256];\
1912
    uint8_t halfHV[256];\
1913
    copy_block17(full, src, 24, stride, 17);\
1914
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918
}\
1919
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1920
    uint8_t full[24*17];\
1921
    uint8_t halfH[272];\
1922
    uint8_t halfHV[256];\
1923
    copy_block17(full, src, 24, stride, 17);\
1924
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1928
}\
1929
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930
    uint8_t full[24*17];\
1931
    uint8_t halfH[272];\
1932
    uint8_t halfV[256];\
1933
    uint8_t halfHV[256];\
1934
    copy_block17(full, src, 24, stride, 17);\
1935
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1936
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939
}\
1940
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1941
    uint8_t full[24*17];\
1942
    uint8_t halfH[272];\
1943
    uint8_t halfHV[256];\
1944
    copy_block17(full, src, 24, stride, 17);\
1945
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1949
}\
1950
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1951
    uint8_t full[24*17];\
1952
    uint8_t halfH[272];\
1953
    uint8_t halfV[256];\
1954
    uint8_t halfHV[256];\
1955
    copy_block17(full, src, 24, stride, 17);\
1956
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1957
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1958
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1960
}\
1961
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1962
    uint8_t full[24*17];\
1963
    uint8_t halfH[272];\
1964
    uint8_t halfHV[256];\
1965
    copy_block17(full, src, 24, stride, 17);\
1966
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1967
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1968
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1970
}\
1971
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1972
    uint8_t full[24*17];\
1973
    uint8_t halfH[272];\
1974
    uint8_t halfV[256];\
1975
    uint8_t halfHV[256];\
1976
    copy_block17(full, src, 24, stride, 17);\
1977
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1978
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1979
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1980
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1981
}\
1982
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1983
    uint8_t full[24*17];\
1984
    uint8_t halfH[272];\
1985
    uint8_t halfHV[256];\
1986
    copy_block17(full, src, 24, stride, 17);\
1987
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1988
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1989
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1990
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1991
}\
1992
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1993
    uint8_t halfH[272];\
1994
    uint8_t halfHV[256];\
1995
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1997
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1998
}\
1999
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2000
    uint8_t halfH[272];\
2001
    uint8_t halfHV[256];\
2002
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2003
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2005
}\
2006
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007
    uint8_t full[24*17];\
2008
    uint8_t halfH[272];\
2009
    uint8_t halfV[256];\
2010
    uint8_t halfHV[256];\
2011
    copy_block17(full, src, 24, stride, 17);\
2012
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2013
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2014
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2016
}\
2017
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2018
    uint8_t full[24*17];\
2019
    uint8_t halfH[272];\
2020
    copy_block17(full, src, 24, stride, 17);\
2021
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2022
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2023
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2024
}\
2025
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026
    uint8_t full[24*17];\
2027
    uint8_t halfH[272];\
2028
    uint8_t halfV[256];\
2029
    uint8_t halfHV[256];\
2030
    copy_block17(full, src, 24, stride, 17);\
2031
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2033
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2035
}\
2036
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2037
    uint8_t full[24*17];\
2038
    uint8_t halfH[272];\
2039
    copy_block17(full, src, 24, stride, 17);\
2040
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2041
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2042
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2043
}\
2044
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2045
    uint8_t halfH[272];\
2046
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2047
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2048
}
2049

    
2050
/*
 * Store operators handed to QPEL_MC() as its OP argument.
 *
 *   a  destination pixel; must be a side-effect-free lvalue, because the
 *      op_avg* forms both read and write it
 *   b  raw MPEG-4 qpel filter sum; the taps (20,20,-6,-6,3,3,-1,-1) add up
 *      to 32, hence the >>5 normalisation
 *
 * A pointer named cm must be in scope at the expansion site (the lowpass
 * functions set it to cropTbl + MAX_NEG_CROP).  The scaled sum is looked up
 * through cm -- presumably to saturate it to 0..255; the table is filled
 * elsewhere, TODO confirm.
 *
 *   op_put          a = scaled b, ties rounded up   (+16 before the shift)
 *   op_put_no_rnd   a = scaled b, ties rounded down (+15 before the shift)
 *   op_avg          a = (a + scaled b + 1) >> 1, b scaled as in op_put
 *   op_avg_no_rnd   a = (a + scaled b) >> 1, b scaled as in op_put_no_rnd
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2054

    
2055
/*
 * Instantiate the MPEG-4 quarter-pel motion-compensation function families.
 *
 * Arguments, as used by the QPEL_MC() body:
 *   2nd (OPNAME)  prefix pasted onto every generated function name,
 *                 e.g. put_ -> put_qpel8_mc00_c, put_qpel16_mc22_c, ...
 *   3rd (RND)     pasted between "put" and "mpeg4_qpel*_lowpass" to pick the
 *                 helper used for intermediate buffers (_ or _no_rnd_)
 *   4th (OP)      store operator applied to each filtered sample
 * NOTE(review): the role of the 1st argument (0/1) is not visible in this
 * part of the file -- check the macro header before changing it.
 */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
/*
 * The avg_no_rnd family is deliberately left disabled.
 * NOTE(review): the line below looks stale -- its OPNAME lacks the trailing
 * underscore and it passes op_avg/_ rather than op_avg_no_rnd/_no_rnd_;
 * fix that before re-enabling it.
 */
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2059
/*
 * The store operators are only meant for the QPEL_MC() instantiations
 * above; remove them here so the names do not leak into the rest of the
 * file and can be reused by later sections.
 */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2063

    
2064
#if 1
2065
#define H264_LOWPASS(OPNAME, OP, OP2) \
2066
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2067
    const int h=2;\
2068
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2069
    int i;\
2070
    for(i=0; i<h; i++)\
2071
    {\
2072
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2073
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2074
        dst+=dstStride;\
2075
        src+=srcStride;\
2076
    }\
2077
}\
2078
\
2079
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2080
    const int w=2;\
2081
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2082
    int i;\
2083
    for(i=0; i<w; i++)\
2084
    {\
2085
        const int srcB= src[-2*srcStride];\
2086
        const int srcA= src[-1*srcStride];\
2087
        const int src0= src[0 *srcStride];\
2088
        const int src1= src[1 *srcStride];\
2089
        const int src2= src[2 *srcStride];\
2090
        const int src3= src[3 *srcStride];\
2091
        const int src4= src[4 *srcStride];\
2092
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2093
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2094
        dst++;\
2095
        src++;\
2096
    }\
2097
}\
2098
\
2099
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2100
    const int h=2;\
2101
    const int w=2;\
2102
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2103
    int i;\
2104
    src -= 2*srcStride;\
2105
    for(i=0; i<h+5; i++)\
2106
    {\
2107
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2108
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2109
        tmp+=tmpStride;\
2110
        src+=srcStride;\
2111
    }\
2112
    tmp -= tmpStride*(h+5-2);\
2113
    for(i=0; i<w; i++)\
2114
    {\
2115
        const int tmpB= tmp[-2*tmpStride];\
2116
        const int tmpA= tmp[-1*tmpStride];\
2117
        const int tmp0= tmp[0 *tmpStride];\
2118
        const int tmp1= tmp[1 *tmpStride];\
2119
        const int tmp2= tmp[2 *tmpStride];\
2120
        const int tmp3= tmp[3 *tmpStride];\
2121
        const int tmp4= tmp[4 *tmpStride];\
2122
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2123
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2124
        dst++;\
2125
        tmp++;\
2126
    }\
2127
}\
2128
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2129
    const int h=4;\
2130
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2131
    int i;\
2132
    for(i=0; i<h; i++)\
2133
    {\
2134
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2135
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2136
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2137
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2138
        dst+=dstStride;\
2139
        src+=srcStride;\
2140
    }\
2141
}\
2142
\
2143
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144
    const int w=4;\
2145
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2146
    int i;\
2147
    for(i=0; i<w; i++)\
2148
    {\
2149
        const int srcB= src[-2*srcStride];\
2150
        const int srcA= src[-1*srcStride];\
2151
        const int src0= src[0 *srcStride];\
2152
        const int src1= src[1 *srcStride];\
2153
        const int src2= src[2 *srcStride];\
2154
        const int src3= src[3 *srcStride];\
2155
        const int src4= src[4 *srcStride];\
2156
        const int src5= src[5 *srcStride];\
2157
        const int src6= src[6 *srcStride];\
2158
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2159
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2160
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2161
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2162
        dst++;\
2163
        src++;\
2164
    }\
2165
}\
2166
\
2167
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2168
    const int h=4;\
2169
    const int w=4;\
2170
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2171
    int i;\
2172
    src -= 2*srcStride;\
2173
    for(i=0; i<h+5; i++)\
2174
    {\
2175
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2176
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2177
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2178
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2179
        tmp+=tmpStride;\
2180
        src+=srcStride;\
2181
    }\
2182
    tmp -= tmpStride*(h+5-2);\
2183
    for(i=0; i<w; i++)\
2184
    {\
2185
        const int tmpB= tmp[-2*tmpStride];\
2186
        const int tmpA= tmp[-1*tmpStride];\
2187
        const int tmp0= tmp[0 *tmpStride];\
2188
        const int tmp1= tmp[1 *tmpStride];\
2189
        const int tmp2= tmp[2 *tmpStride];\
2190
        const int tmp3= tmp[3 *tmpStride];\
2191
        const int tmp4= tmp[4 *tmpStride];\
2192
        const int tmp5= tmp[5 *tmpStride];\
2193
        const int tmp6= tmp[6 *tmpStride];\
2194
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2195
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2196
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2197
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2198
        dst++;\
2199
        tmp++;\
2200
    }\
2201
}\
2202
\
2203
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204
    const int h=8;\
2205
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2206
    int i;\
2207
    for(i=0; i<h; i++)\
2208
    {\
2209
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2210
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2211
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2212
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2213
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2214
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2215
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2216
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2217
        dst+=dstStride;\
2218
        src+=srcStride;\
2219
    }\
2220
}\
2221
\
2222
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223
    const int w=8;\
2224
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2225
    int i;\
2226
    for(i=0; i<w; i++)\
2227
    {\
2228
        const int srcB= src[-2*srcStride];\
2229
        const int srcA= src[-1*srcStride];\
2230
        const int src0= src[0 *srcStride];\
2231
        const int src1= src[1 *srcStride];\
2232
        const int src2= src[2 *srcStride];\
2233
        const int src3= src[3 *srcStride];\
2234
        const int src4= src[4 *srcStride];\
2235
        const int src5= src[5 *srcStride];\
2236
        const int src6= src[6 *srcStride];\
2237
        const int src7= src[7 *srcStride];\
2238
        const int src8= src[8 *srcStride];\
2239
        const int src9= src[9 *srcStride];\
2240
        const int src10=src[10*srcStride];\
2241
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2242
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2243
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2244
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2245
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2246
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2247
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2248
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2249
        dst++;\
2250
        src++;\
2251
    }\
2252
}\
2253
\
2254
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2255
    const int h=8;\
2256
    const int w=8;\
2257
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2258
    int i;\
2259
    src -= 2*srcStride;\
2260
    for(i=0; i<h+5; i++)\
2261
    {\
2262
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2263
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2264
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2265
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2266
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2267
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2268
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2269
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2270
        tmp+=tmpStride;\
2271
        src+=srcStride;\
2272
    }\
2273
    tmp -= tmpStride*(h+5-2);\
2274
    for(i=0; i<w; i++)\
2275
    {\
2276
        const int tmpB= tmp[-2*tmpStride];\
2277
        const int tmpA= tmp[-1*tmpStride];\
2278
        const int tmp0= tmp[0 *tmpStride];\
2279
        const int tmp1= tmp[1 *tmpStride];\
2280
        const int tmp2= tmp[2 *tmpStride];\
2281
        const int tmp3= tmp[3 *tmpStride];\
2282
        const int tmp4= tmp[4 *tmpStride];\
2283
        const int tmp5= tmp[5 *tmpStride];\
2284
        const int tmp6= tmp[6 *tmpStride];\
2285
        const int tmp7= tmp[7 *tmpStride];\
2286
        const int tmp8= tmp[8 *tmpStride];\
2287
        const int tmp9= tmp[9 *tmpStride];\
2288
        const int tmp10=tmp[10*tmpStride];\
2289
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2290
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2291
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2292
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2293
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2294
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2295
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2296
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2297
        dst++;\
2298
        tmp++;\
2299
    }\
2300
}\
2301
\
2302
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2303
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2304
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2305
    src += 8*srcStride;\
2306
    dst += 8*dstStride;\
2307
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2308
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2309
}\
2310
\
2311
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2312
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2313
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2314
    src += 8*srcStride;\
2315
    dst += 8*dstStride;\
2316
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2317
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2318
}\
2319
\
2320
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2321
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2322
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2323
    src += 8*srcStride;\
2324
    dst += 8*dstStride;\
2325
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2326
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2327
}\
2328

    
2329
/**
 * Generate the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c
 * (X, Y in 0..3) operating on SIZE x SIZE blocks.
 *
 * OPNAME selects the final store: it is pasted in front of the
 * pixels##SIZE##_c / pixels##SIZE##_l2 / h264_qpel##SIZE##_*_lowpass helpers
 * used for the last step.  Intermediate planes are always produced with the
 * put_ lowpass variants into local buffers.
 *
 * Building blocks used below:
 *  - _h_lowpass  : horizontally filtered plane
 *  - _v_lowpass  : vertically filtered plane; it is fed from "full", a
 *                  SIZE-wide copy of SIZE+5 rows starting two rows above src,
 *                  with full_mid pointing at the row corresponding to src
 *  - _hv_lowpass : plane filtered in both directions, using the int16_t
 *                  scratch buffer tmp
 *  - pixels_l2   : combines two planes into dst
 *
 * All generated functions take (dst, src, stride); the single stride is used
 * for both dst and src.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: plain block copy/average of src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: src combined with the horizontally filtered plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontally filtered plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: like mc10 but combined with src shifted one pixel to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: src rows (via full_mid) combined with the vertically filtered plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertically filtered plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* mc03: like mc01 but combined with the source shifted one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: h-filtered plane at src combined with v-filtered plane at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, but the v-filtered plane is taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, but the h-filtered plane is taken one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: h-filtered plane one row down, v-filtered plane one column right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: plane filtered in both directions written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* mc21: h-filtered plane at src combined with the hv-filtered plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, but the h-filtered plane is taken one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: v-filtered plane at src combined with the hv-filtered plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, but the v-filtered plane is taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2465

    
2466
/*
 * Store operators handed to H264_LOWPASS.
 *
 * "b" is an unnormalised filter sum.  The op_* forms round and divide by 32
 * (+16, >>5); the op2_* forms round and divide by 1024 (+512, >>10) and are
 * used on sums that went through the filter twice.  The result is looked up
 * through "cm", which must be in scope where the macro is expanded.
 * The _put forms overwrite "a"; the _avg forms store the rounded-up mean of
 * the old "a" and the new value.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2471

    
2472
H264_LOWPASS(put_       , op_put, op2_put)
2473
H264_LOWPASS(avg_       , op_avg, op2_avg)
2474
H264_MC(put_, 2)
2475
H264_MC(put_, 4)
2476
H264_MC(put_, 8)
2477
H264_MC(put_, 16)
2478
H264_MC(avg_, 4)
2479
H264_MC(avg_, 8)
2480
H264_MC(avg_, 16)
2481

    
2482
#undef op_avg
2483
#undef op_put
2484
#undef op2_avg
2485
#undef op2_put
2486
#endif
2487

    
2488
/* Per-pixel kernels used by H264_WEIGHT; they rely on the locals/parameters
   of the generated functions (block, dst, src, weight, weights, weightd,
   offset, log2_denom). */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generate weight_h264_pixels<W>x<H>_c and biweight_h264_pixels<W>x<H>_c.
 *
 * weight_*:   in place, block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 *             where offset' is offset << log2_denom, plus the rounding term
 *             1 << (log2_denom-1) when log2_denom is non-zero.
 * biweight_*: dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                                 >> (log2_denom+1))
 *             where offset' is ((offset + 1) | 1) << log2_denom.
 *
 * Each row is unrolled for up to 16 pixels; because W is a constant in every
 * instantiation, the "if(W==n) continue;" tests cut the row off after W pixels.
 * W is expected to be 2, 4, 8 or 16.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2542

    
2543
/* Instantiate the weight/biweight functions for each W x H block size used. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

/* The generator and its per-pixel kernels are not needed past this point. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2557

    
2558
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2559
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2560
    int i;
2561

    
2562
    for(i=0; i<h; i++){
2563
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2564
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2565
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2566
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2567
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2568
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2569
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2570
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2571
        dst+=dstStride;
2572
        src+=srcStride;
2573
    }
2574
}
2575

    
2576
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2577
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2578
    int i;
2579

    
2580
    for(i=0; i<w; i++){
2581
        const int src_1= src[ -srcStride];
2582
        const int src0 = src[0          ];
2583
        const int src1 = src[  srcStride];
2584
        const int src2 = src[2*srcStride];
2585
        const int src3 = src[3*srcStride];
2586
        const int src4 = src[4*srcStride];
2587
        const int src5 = src[5*srcStride];
2588
        const int src6 = src[6*srcStride];
2589
        const int src7 = src[7*srcStride];
2590
        const int src8 = src[8*srcStride];
2591
        const int src9 = src[9*srcStride];
2592
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2593
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2594
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2595
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2596
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2597
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2598
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2599
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2600
        src++;
2601
        dst++;
2602
    }
2603
}
2604

    
2605
/** mc00: hand the 8x8 block to put_pixels8_c unchanged (no filtering). */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2608

    
2609
/** mc10: combine src with its horizontally filtered version (8x8). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];   /* 8x8 horizontally filtered plane, row stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2614

    
2615
/** mc20: write the horizontally filtered 8x8 block straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2618

    
2619
/** mc30: like mc10, but combined with src shifted one pixel to the right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];   /* 8x8 horizontally filtered plane, row stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2624

    
2625
/** mc02: write the vertically filtered 8x8 block straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2628

    
2629
/**
 * mc12: combine the vertically filtered block with the block filtered in
 * both directions.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 rows x 8: horizontally filtered rows -1 .. 9, which are the rows
       the vertical pass reads; halfH+8 is the row aligned with src. */
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2638
/**
 * mc32: like mc12, but the vertically filtered block is taken one pixel to
 * the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 rows x 8: horizontally filtered rows -1 .. 9, which are the rows
       the vertical pass reads; halfH+8 is the row aligned with src. */
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2647
/** mc22: horizontal pass into a scratch plane, then vertical pass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 rows x 8: horizontally filtered rows -1 .. 9, which are the rows
       the vertical pass reads; halfH+8 is the row aligned with src. */
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2652

    
2653
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2654
    int x;
2655
    const int strength= ff_h263_loop_filter_strength[qscale];
2656

    
2657
    for(x=0; x<8; x++){
2658
        int d1, d2, ad1;
2659
        int p0= src[x-2*stride];
2660
        int p1= src[x-1*stride];
2661
        int p2= src[x+0*stride];
2662
        int p3= src[x+1*stride];
2663
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2664

    
2665
        if     (d<-2*strength) d1= 0;
2666
        else if(d<-  strength) d1=-2*strength - d;
2667
        else if(d<   strength) d1= d;
2668
        else if(d< 2*strength) d1= 2*strength - d;
2669
        else                   d1= 0;
2670

    
2671
        p1 += d1;
2672
        p2 -= d1;
2673
        if(p1&256) p1= ~(p1>>31);
2674
        if(p2&256) p2= ~(p2>>31);
2675

    
2676
        src[x-1*stride] = p1;
2677
        src[x+0*stride] = p2;
2678

    
2679
        ad1= ABS(d1)>>1;
2680

    
2681
        d2= clip((p0-p3)/4, -ad1, ad1);
2682

    
2683
        src[x-2*stride] = p0 - d2;
2684
        src[x+  stride] = p3 + d2;
2685
    }
2686
}
2687

    
2688
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2689
    int y;
2690
    const int strength= ff_h263_loop_filter_strength[qscale];
2691

    
2692
    for(y=0; y<8; y++){
2693
        int d1, d2, ad1;
2694
        int p0= src[y*stride-2];
2695
        int p1= src[y*stride-1];
2696
        int p2= src[y*stride+0];
2697
        int p3= src[y*stride+1];
2698
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2699

    
2700
        if     (d<-2*strength) d1= 0;
2701
        else if(d<-  strength) d1=-2*strength - d;
2702
        else if(d<   strength) d1= d;
2703
        else if(d< 2*strength) d1= 2*strength - d;
2704
        else                   d1= 0;
2705

    
2706
        p1 += d1;
2707
        p2 -= d1;
2708
        if(p1&256) p1= ~(p1>>31);
2709
        if(p2&256) p2= ~(p2>>31);
2710

    
2711
        src[y*stride-1] = p1;
2712
        src[y*stride+0] = p2;
2713

    
2714
        ad1= ABS(d1)>>1;
2715

    
2716
        d2= clip((p0-p3)/4, -ad1, ad1);
2717

    
2718
        src[y*stride-2] = p0 - d2;
2719
        src[y*stride+1] = p3 + d2;
2720
    }
2721
}
2722

    
2723
/**
 * H.261 loop filter: separable (1, 2, 1) smoothing of an 8x8 block, in place.
 *
 * The vertical pass writes unnormalised sums (gain 4) into temp; the
 * horizontal pass then filters temp and normalises.  Samples on the block
 * border are not filtered in the direction that would cross the border.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* Top and bottom rows skip the vertical filter; scale them by 4 so
       they carry the same gain as the filtered rows. */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* Vertical (1, 2, 1) on rows 1..6. */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* Left and right columns: only undo the vertical gain of 4. */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        /* Horizontal (1, 2, 1) on columns 1..6; total gain is 16. */
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2749

    
2750
/**
 * H.264 luma deblocking of one edge: 16 lines in 4 groups of 4.
 *
 * p0, p1, p2 are the samples at -1, -2, -3 times xstride from pix and
 * q0, q1, q2 the samples at 0, +1, +2 times xstride.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     four clipping values, one per group of 4 lines; a negative
 *                entry leaves that group untouched
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its 4 lines */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* tc starts at tc0[i] and grows by one for each side on
                   which the second sample (p1 / q1) is also filtered. */
                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* p0/q0 use the values loaded before any store above. */
                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2790
/** Luma deblocking with samples across the edge one row apart (xstride =
    stride) and consecutive filtered lines one pixel apart (ystride = 1). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
2794
/** Luma deblocking with samples across the edge one pixel apart (xstride =
    1) and consecutive filtered lines one row apart (ystride = stride). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2798

    
2799
/**
 * H.264 chroma deblocking of one edge: 8 lines in 4 groups of 2.
 *
 * Only p0 (at -xstride) and q0 (at pix) are modified; p1 and q1 are read
 * for the activity tests and the delta.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping values, one per group of 2 lines; an entry
 *                that is zero or negative leaves that group untouched
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* group disabled: step over its 2 lines */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2827
/** Chroma deblocking with samples across the edge one row apart (xstride =
    stride) and consecutive filtered lines one pixel apart (ystride = 1). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
2831
/** Chroma deblocking with samples across the edge one pixel apart (xstride =
    1) and consecutive filtered lines one row apart (ystride = stride). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2835

    
2836
/**
 * H.264 chroma deblocking of one edge without a tc clipping table: 8 lines.
 *
 * When the activity tests pass, p0 and q0 are replaced by rounded weighted
 * averages of (p1, p0, q1) and (q1, q0, p1) respectively.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
2855
/** Intra chroma deblocking with samples across the edge one row apart
    (xstride = stride) and filtered lines one pixel apart (ystride = 1). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
2859
/** Intra chroma deblocking with samples across the edge one pixel apart
    (xstride = 1) and filtered lines one row apart (ystride = stride). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2863

    
2864
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2865
{
2866
    int s, i;
2867

    
2868
    s = 0;
2869
    for(i=0;i<h;i++) {
2870
        s += abs(pix1[0] - pix2[0]);
2871
        s += abs(pix1[1] - pix2[1]);
2872
        s += abs(pix1[2] - pix2[2]);
2873
        s += abs(pix1[3] - pix2[3]);
2874
        s += abs(pix1[4] - pix2[4]);
2875
        s += abs(pix1[5] - pix2[5]);
2876
        s += abs(pix1[6] - pix2[6]);
2877
        s += abs(pix1[7] - pix2[7]);
2878
        s += abs(pix1[8] - pix2[8]);
2879
        s += abs(pix1[9] - pix2[9]);
2880
        s += abs(pix1[10] - pix2[10]);
2881
        s += abs(pix1[11] - pix2[11]);
2882
        s += abs(pix1[12] - pix2[12]);
2883
        s += abs(pix1[13] - pix2[13]);
2884
        s += abs(pix1[14] - pix2[14]);
2885
        s += abs(pix1[15] - pix2[15]);
2886
        pix1 += line_size;
2887
        pix2 += line_size;
2888
    }
2889
    return s;
2890
}
2891

    
2892
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * averaged with its right-hand neighbour (avg2 of pix2[x] and pix2[x+1]).
 *
 * Reads 17 pixels per row from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      first block
 * @param pix2      second block; column 16 must be readable
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2919

    
2920
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * averaged with the row below it (avg2 of pix2[x] and the pixel one
 * line_size further).
 *
 * Reads h+1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      first block
 * @param pix2      second block; row h must be readable
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2949

    
2950
/**
 * Sum of absolute differences between a 16-wide block and the second block
 * averaged over each 2x2 neighbourhood (avg4 of pix2[x], pix2[x+1] and the
 * two pixels directly below them).
 *
 * Reads 17 pixels per row and h+1 rows from pix2.
 *
 * @param v         not used by this implementation
 * @param pix1      first block
 * @param pix2      second block; column 16 and row h must be readable
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size;   /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2979

    
2980
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         not used by this implementation
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows to compare
 * @return sum over h rows and 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<8;x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2999

    
3000
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are avg2() of each pixel and its right-hand neighbour.
 * Nine bytes are read from every pix2 row.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3019

    
3020
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are avg2() of each pixel and the pixel one row below it.
 * Rows 0..h of pix2 are read (h + 1 rows).
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3041

    
3042
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are avg4() of each pixel, its right-hand neighbour and the
 * two pixels directly below those. Nine bytes from h + 1 rows of pix2 are read.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size offset in bytes between consecutive rows
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3063

    
3064
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3065
    MpegEncContext *c = v;
3066
    int score1=0;
3067
    int score2=0;
3068
    int x,y;
3069

    
3070
    for(y=0; y<h; y++){
3071
        for(x=0; x<16; x++){
3072
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3073
        }
3074
        if(y+1<h){
3075
            for(x=0; x<15; x++){
3076
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3077
                             - s1[x+1] + s1[x+1+stride])
3078
                        -ABS(  s2[x  ] - s2[x  +stride]
3079
                             - s2[x+1] + s2[x+1+stride]);
3080
            }
3081
        }
3082
        s1+= stride;
3083
        s2+= stride;
3084
    }
3085

    
3086
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3087
    else  return score1 + ABS(score2)*8;
3088
}
3089

    
3090
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3091
    MpegEncContext *c = v;
3092
    int score1=0;
3093
    int score2=0;
3094
    int x,y;
3095

    
3096
    for(y=0; y<h; y++){
3097
        for(x=0; x<8; x++){
3098
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3099
        }
3100
        if(y+1<h){
3101
            for(x=0; x<7; x++){
3102
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3103
                             - s1[x+1] + s1[x+1+stride])
3104
                        -ABS(  s2[x  ] - s2[x  +stride]
3105
                             - s2[x+1] + s2[x+1+stride]);
3106
            }
3107
        }
3108
        s1+= stride;
3109
        s2+= stride;
3110
    }
3111

    
3112
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3113
    else  return score1 + ABS(score2)*8;
3114
}
3115

    
3116
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3117
    int i;
3118
    unsigned int sum=0;
3119

    
3120
    for(i=0; i<8*8; i++){
3121
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3122
        int w= weight[i];
3123
        b>>= RECON_SHIFT;
3124
        assert(-512<b && b<512);
3125

    
3126
        sum += (w*b)*(w*b)>>4;
3127
    }
3128
    return sum>>2;
3129
}
3130

    
3131
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3132
    int i;
3133

    
3134
    for(i=0; i<8*8; i++){
3135
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3136
    }
3137
}
3138

    
3139
/**
3140
 * permutes an 8x8 block.
3141
 * @param block the block which will be permuted according to the given permutation vector
3142
 * @param permutation the permutation vector
3143
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3144
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3145
 *                  (inverse) permutated to scantable order!
3146
 */
3147
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3148
{
3149
    int i;
3150
    DCTELEM temp[64];
3151

    
3152
    if(last<=0) return;
3153
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3154

    
3155
    for(i=0; i<=last; i++){
3156
        const int j= scantable[i];
3157
        temp[j]= block[j];
3158
        block[j]=0;
3159
    }
3160

    
3161
    for(i=0; i<=last; i++){
3162
        const int j= scantable[i];
3163
        const int perm_j= permutation[j];
3164
        block[perm_j]= temp[j];
3165
    }
3166
}
3167

    
3168
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3171

    
3172
/**
 * Fills the 5 entries of cmp with the comparison functions selected by the
 * low 8 bits of type.
 * cmp is zeroed first. For an unknown type it stays zeroed and an error is
 * logged once per entry.
 * @param c    DSP context providing the function tables
 * @param cmp  destination array of 5 function pointers
 * @param type FF_CMP_* selector; only (type & 0xFF) is examined
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    me_cmp_func *table= NULL; /* stays NULL for FF_CMP_ZERO */
    int known= 1;
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    switch(type&0xFF){
    case FF_CMP_SAD:    table= c->sad;            break;
    case FF_CMP_SATD:   table= c->hadamard8_diff; break;
    case FF_CMP_SSE:    table= c->sse;            break;
    case FF_CMP_DCT:    table= c->dct_sad;        break;
    case FF_CMP_DCT264: table= c->dct264_sad;     break;
    case FF_CMP_DCTMAX: table= c->dct_max;        break;
    case FF_CMP_PSNR:   table= c->quant_psnr;     break;
    case FF_CMP_BIT:    table= c->bit;            break;
    case FF_CMP_RD:     table= c->rd;             break;
    case FF_CMP_VSAD:   table= c->vsad;           break;
    case FF_CMP_VSSE:   table= c->vsse;           break;
    case FF_CMP_NSSE:   table= c->nsse;           break;
    case FF_CMP_W53:    table= c->w53;            break;
    case FF_CMP_W97:    table= c->w97;            break;
    case FF_CMP_ZERO:                             break;
    default:
        known= 0;
    }

    for(i=0; i<5; i++){
        if(!known)
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        else if(table)
            cmp[i]= table[i];
        else
            cmp[i]= zero_cmp;
    }
}
3229

    
3230
/**
3231
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3232
 */
3233
static void clear_blocks_c(DCTELEM *blocks)
3234
{
3235
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3236
}
3237

    
3238
/**
 * Adds src to dst byte by byte, in ascending index order, wrapping modulo 256.
 * @param dst bytes updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing happens when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] += src[pos];
}
3253

    
3254
/**
 * Stores src1 - src2 into dst byte by byte, in ascending index order,
 * wrapping modulo 256.
 * @param dst  destination bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing happens when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] = src1[pos]-src2[pos];
}
3269

    
3270
/**
 * Writes into dst the difference between each src2 byte and its prediction.
 * The prediction is mid_pred() of the left value, the src1 byte at the same
 * position, and (left + src1 byte - left_top) & 0xFF.
 * @param dst      output residual bytes
 * @param src1     w bytes supplying the second predictor and the next left_top
 * @param src2     w bytes being predicted; each becomes the next left value
 * @param w        number of bytes
 * @param left     in: initial left value (truncated to 8 bits); out: last src2 byte processed
 * @param left_top in: initial left_top value (truncated to 8 bits); out: last src1 byte processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left   = *left;
    uint8_t cur_lefttop= *left_top;
    int x;

    for(x=0; x<w; x++){
        const int top = src1[x];
        const int pred= mid_pred(cur_left, top, (cur_left + top - cur_lefttop)&0xFF);

        cur_lefttop= top;
        cur_left   = src2[x];
        dst[x]     = cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_lefttop;
}
3287

    
3288
/*
 * Butterfly helpers for the Hadamard transforms below.
 * The statement-like macros are wrapped in do{}while(0) so that each use
 * followed by ';' is exactly one statement, even as the body of an unbraced
 * if/else or loop.
 */

/* o1 = i1 + i2, o2 = i1 - i2; the inputs must not depend on o1 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}while(0)

/* in-place butterfly: x becomes x + y and y becomes x - y (old values) */
#define BUTTERFLY1(x,y) \
do{\
    int bf_a,bf_b;\
    bf_a= (x);\
    bf_b= (y);\
    (x)= bf_a+bf_b;\
    (y)= bf_a-bf_b;\
}while(0)

/* |x + y| + |x - y|; evaluates both arguments more than once */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3302

    
3303
/**
 * Applies an 8x8 butterfly (Hadamard) transform to the pixel difference
 * src - dst and returns the sum of the absolute transformed values.
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride offset in bytes between consecutive rows of both blocks
 * @param h      must be 8 (asserted)
 * @return sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* row pass: three butterfly stages over each row of differences */
    for(n=0; n<8; n++){
        int *row= coef + 8*n;
        const uint8_t *sp= src + stride*n;
        const uint8_t *dp= dst + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two in-place stages; the last stage is folded into the
       absolute-value accumulation by BUTTERFLYA */
    for(n=0; n<8; n++){
        int *col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return total;
}
3354

    
3355
/**
 * Applies an 8x8 butterfly (Hadamard) transform to the pixels of src and
 * returns the sum of the absolute transformed values, excluding the DC term.
 * @param s      unused
 * @param src    8x8 block
 * @param dummy  unused
 * @param stride offset in bytes between consecutive rows
 * @param h      must be 8 (asserted)
 * @return sum of absolute transform coefficients minus the absolute DC term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    /* row pass: three butterfly stages over each row of pixels */
    for(n=0; n<8; n++){
        int *row= coef + 8*n;
        const uint8_t *sp= src + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two in-place stages; the last stage is folded into the
       absolute-value accumulation by BUTTERFLYA */
    for(n=0; n<8; n++){
        int *col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* remove the mean (DC) contribution counted for column 0 */
    total -= ABS(coef[8*0] + coef[8*4]);

    return total;
}
3402

    
3403
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3404
    MpegEncContext * const s= (MpegEncContext *)c;
3405
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3406
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3407
    int sum=0, i;
3408

    
3409
    assert(h==8);
3410

    
3411
    s->dsp.diff_pixels(temp, src1, src2, stride);
3412
    s->dsp.fdct(temp);
3413

    
3414
    for(i=0; i<64; i++)
3415
        sum+= ABS(temp[i]);
3416

    
3417
    return sum;
3418
}
3419

    
3420
#ifdef CONFIG_GPL
/*
 * One 8-point 1-D integer transform built from adds and shifts.
 * The user defines SRC(x) to read input x (0..7) and DST(x,v) to consume
 * output x. Every input is read before any output is produced, so SRC and
 * DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute coefficients of the DCT8_1D transform, applied first
 * along rows and then along columns, of the 8x8 pixel difference produced by
 * the context's diff_pixels.
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first 8x8 block
 * @param src2   second 8x8 block
 * @param stride offset in bytes between consecutive rows
 * @param h      not used by this function
 * @return sum of the 64 absolute coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* diff_pixels takes a flat coefficient pointer, so pass the first row
       (int16_t *) rather than the 2-D array (int16_t (*)[8]) */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform each column and accumulate absolute outputs */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3472

    
3473
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474
    MpegEncContext * const s= (MpegEncContext *)c;
3475
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3476
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3477
    int sum=0, i;
3478

    
3479
    assert(h==8);
3480

    
3481
    s->dsp.diff_pixels(temp, src1, src2, stride);
3482
    s->dsp.fdct(temp);
3483

    
3484
    for(i=0; i<64; i++)
3485
        sum= FFMAX(sum, ABS(temp[i]));
3486

    
3487
    return sum;
3488
}
3489

    
3490
void simple_idct(DCTELEM *block); //FIXME
3491

    
3492
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493
    MpegEncContext * const s= (MpegEncContext *)c;
3494
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3495
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3496
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3497
    int sum=0, i;
3498

    
3499
    assert(h==8);
3500
    s->mb_intra=0;
3501

    
3502
    s->dsp.diff_pixels(temp, src1, src2, stride);
3503

    
3504
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3505

    
3506
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3507
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3508
    simple_idct(temp); //FIXME
3509

    
3510
    for(i=0; i<64; i++)
3511
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3512

    
3513
    return sum;
3514
}
3515

    
3516
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3517
    MpegEncContext * const s= (MpegEncContext *)c;
3518
    const uint8_t *scantable= s->intra_scantable.permutated;
3519
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3520
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3521
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3522
    uint8_t * const bak= (uint8_t*)aligned_bak;
3523
    int i, last, run, bits, level, distoration, start_i;
3524
    const int esc_length= s->ac_esc_length;
3525
    uint8_t * length;
3526
    uint8_t * last_length;
3527

    
3528
    assert(h==8);
3529

    
3530
    for(i=0; i<8; i++){
3531
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3532
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3533
    }
3534

    
3535
    s->dsp.diff_pixels(temp, src1, src2, stride);
3536

    
3537
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3538

    
3539
    bits=0;
3540

    
3541
    if (s->mb_intra) {
3542
        start_i = 1;
3543
        length     = s->intra_ac_vlc_length;
3544
        last_length= s->intra_ac_vlc_last_length;
3545
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3546
    } else {
3547
        start_i = 0;
3548
        length     = s->inter_ac_vlc_length;
3549
        last_length= s->inter_ac_vlc_last_length;
3550
    }
3551

    
3552
    if(last>=start_i){
3553
        run=0;
3554
        for(i=start_i; i<last; i++){
3555
            int j= scantable[i];
3556
            level= temp[j];
3557

    
3558
            if(level){
3559
                level+=64;
3560
                if((level&(~127)) == 0){
3561
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3562
                }else
3563
                    bits+= esc_length;
3564
                run=0;
3565
            }else
3566
                run++;
3567
        }
3568
        i= scantable[last];
3569

    
3570
        level= temp[i] + 64;
3571

    
3572
        assert(level - 64);
3573

    
3574
        if((level&(~127)) == 0){
3575
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3576
        }else
3577
            bits+= esc_length;
3578

    
3579
    }
3580

    
3581
    if(last>=0){
3582
        if(s->mb_intra)
3583
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3584
        else
3585
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3586
    }
3587

    
3588
    s->dsp.idct_add(bak, stride, temp);
3589

    
3590
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3591

    
3592
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3593
}
3594

    
3595
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3596
    MpegEncContext * const s= (MpegEncContext *)c;
3597
    const uint8_t *scantable= s->intra_scantable.permutated;
3598
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3599
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3600
    int i, last, run, bits, level, start_i;
3601
    const int esc_length= s->ac_esc_length;
3602
    uint8_t * length;
3603
    uint8_t * last_length;
3604

    
3605
    assert(h==8);
3606

    
3607
    s->dsp.diff_pixels(temp, src1, src2, stride);
3608

    
3609
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3610

    
3611
    bits=0;
3612

    
3613
    if (s->mb_intra) {
3614
        start_i = 1;
3615
        length     = s->intra_ac_vlc_length;
3616
        last_length= s->intra_ac_vlc_last_length;
3617
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3618
    } else {
3619
        start_i = 0;
3620
        length     = s->inter_ac_vlc_length;
3621
        last_length= s->inter_ac_vlc_last_length;
3622
    }
3623

    
3624
    if(last>=start_i){
3625
        run=0;
3626
        for(i=start_i; i<last; i++){
3627
            int j= scantable[i];
3628
            level= temp[j];
3629

    
3630
            if(level){
3631
                level+=64;
3632
                if((level&(~127)) == 0){
3633
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3634
                }else
3635
                    bits+= esc_length;
3636
                run=0;
3637
            }else
3638
                run++;
3639
        }
3640
        i= scantable[last];
3641

    
3642
        level= temp[i] + 64;
3643

    
3644
        assert(level - 64);
3645

    
3646
        if((level&(~127)) == 0){
3647
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3648
        }else
3649
            bits+= esc_length;
3650
    }
3651

    
3652
    return bits;
3653
}
3654

    
3655
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset in bytes between consecutive rows
 * @param h      number of rows; h - 1 row pairs are compared
 * @return accumulated absolute vertical differences
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *below= s + stride;

        for(col=0; col<16; col++){
            total+= ABS(s[col] - below[col]);
        }
        s+= stride;
    }

    return total;
}
3669

    
3670
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset in bytes between consecutive rows
 * @param h      number of rows; h - 1 row pairs are compared
 * @return accumulated |(s1 - s2) - (s1 - s2 one row below)|
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int d= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];

            total+= ABS(d);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3684

    
3685
/* square of a; evaluates its argument twice */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset in bytes between consecutive rows
 * @param h      number of rows; h - 1 row pairs are compared
 * @return accumulated squared vertical differences
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *below= s + stride;

        for(col=0; col<16; col++){
            const int d= s[col] - below[col];

            total+= SQ(d);
        }
        s+= stride;
    }

    return total;
}
3700

    
3701
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset in bytes between consecutive rows
 * @param h      number of rows; h - 1 row pairs are compared
 * @return accumulated ((s1 - s2) - (s1 - s2 one row below))^2
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int d= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];

            total+= SQ(d);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3715

    
3716
/*
 * Each line pairs an 8x8 comparison function defined above with the name of
 * a 16-suffixed counterpart.
 * NOTE(review): WARPER8_16_SQ is defined outside this part of the file;
 * presumably it defines the second name as a 16-pixel-wide wrapper that calls
 * the first one on 8x8 sub-blocks -- confirm against the macro definition.
 * The dct264 pair exists only when CONFIG_GPL is defined, matching the
 * function it wraps.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3717
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3718
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3719
#ifdef CONFIG_GPL
3720
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3721
#endif
3722
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3723
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3724
WARPER8_16_SQ(rd8x8_c, rd16_c)
3725
WARPER8_16_SQ(bit8x8_c, bit16_c)
3726

    
3727
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3728
 converted */
3729
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3730
{
3731
    j_rev_dct (block);
3732
    put_pixels_clamped_c(block, dest, line_size);
3733
}
3734
/**
 * Runs j_rev_dct() on block in place, then adds the result to dest with
 * add_pixels_clamped_c().
 * @param dest      destination pixels, updated in place
 * @param line_size offset in bytes between consecutive destination rows
 * @param block     coefficient block, overwritten by the inverse transform
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
3739

    
3740
/**
 * Runs j_rev_dct4() on block in place, then stores the result into dest with
 * put_pixels_clamped4_c().
 * @param dest      destination pixels
 * @param line_size offset in bytes between consecutive destination rows
 * @param block     coefficient block, overwritten by the inverse transform
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}
3745
/**
 * Runs j_rev_dct4() on block in place, then adds the result to dest with
 * add_pixels_clamped4_c().
 * @param dest      destination pixels, updated in place
 * @param line_size offset in bytes between consecutive destination rows
 * @param block     coefficient block, overwritten by the inverse transform
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3750

    
3751
/**
 * Runs j_rev_dct2() on block in place, then stores the result into dest with
 * put_pixels_clamped2_c().
 * @param dest      destination pixels
 * @param line_size offset in bytes between consecutive destination rows
 * @param block     coefficient block, overwritten by the inverse transform
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}
3756
/**
 * Runs j_rev_dct2() on block in place, then adds the result to dest with
 * add_pixels_clamped2_c().
 * @param dest      destination pixels, updated in place
 * @param line_size offset in bytes between consecutive destination rows
 * @param block     coefficient block, overwritten by the inverse transform
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3761

    
3762
/**
 * Single-pixel variant: stores (block[0] + 4) >> 3, clamped through cropTbl,
 * into dest[0].
 * @param dest      destination pixel
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* cropTbl is offset by MAX_NEG_CROP so that negative indices are valid */
    uint8_t *clip = cropTbl + MAX_NEG_CROP;
    const int dc  = (block[0] + 4)>>3;

    dest[0] = clip[dc];
}
3768
/**
 * Single-pixel variant: adds (block[0] + 4) >> 3 to dest[0] and stores the
 * sum clamped through cropTbl.
 * @param dest      destination pixel, updated in place
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    /* cropTbl is offset by MAX_NEG_CROP so that negative indices are valid */
    uint8_t *clip = cropTbl + MAX_NEG_CROP;
    const int dc  = (block[0] + 4)>>3;

    dest[0] = clip[dest[0] + dc];
}
3774

    
3775
/* init static data */
3776
void dsputil_static_init(void)
3777
{
3778
    int i;
3779

    
3780
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3781
    for(i=0;i<MAX_NEG_CROP;i++) {
3782
        cropTbl[i] = 0;
3783
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3784
    }
3785

    
3786
    for(i=0;i<512;i++) {
3787
        squareTbl[i] = (i - 256) * (i - 256);
3788
    }
3789

    
3790
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3791
}
3792

    
3793

    
3794
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3795
{
3796
    int i;
3797

    
3798
#ifdef CONFIG_ENCODERS
3799
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3800
        c->fdct = fdct_ifast;
3801
        c->fdct248 = fdct_ifast248;
3802
    }
3803
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3804
        c->fdct = ff_faandct;
3805
        c->fdct248 = ff_faandct248;
3806
    }
3807
    else {
3808
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3809
        c->fdct248 = ff_fdct248_islow;
3810
    }
3811
#endif //CONFIG_ENCODERS
3812

    
3813
    if(avctx->lowres==1){
3814
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3815
            c->idct_put= ff_jref_idct4_put;
3816
            c->idct_add= ff_jref_idct4_add;
3817
        }else{
3818
            c->idct_put= ff_h264_lowres_idct_put_c;
3819
            c->idct_add= ff_h264_lowres_idct_add_c;
3820
        }
3821
        c->idct    = j_rev_dct4;
3822
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3823
    }else if(avctx->lowres==2){
3824
        c->idct_put= ff_jref_idct2_put;
3825
        c->idct_add= ff_jref_idct2_add;
3826
        c->idct    = j_rev_dct2;
3827
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3828
    }else if(avctx->lowres==3){
3829
        c->idct_put= ff_jref_idct1_put;
3830
        c->idct_add= ff_jref_idct1_add;
3831
        c->idct    = j_rev_dct1;
3832
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3833
    }else{
3834
        if(avctx->idct_algo==FF_IDCT_INT){
3835
            c->idct_put= ff_jref_idct_put;
3836
            c->idct_add= ff_jref_idct_add;
3837
            c->idct    = j_rev_dct;
3838
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3839
        }else if(avctx->idct_algo==FF_IDCT_VP3){
3840
            c->idct_put= ff_vp3_idct_put_c;
3841
            c->idct_add= ff_vp3_idct_add_c;
3842
            c->idct    = ff_vp3_idct_c;
3843
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3844
        }else{ //accurate/default
3845
            c->idct_put= simple_idct_put;
3846
            c->idct_add= simple_idct_add;
3847
            c->idct    = simple_idct;
3848
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3849
        }
3850
    }
3851

    
3852
    c->h264_idct_add= ff_h264_idct_add_c;
3853
    c->h264_idct8_add= ff_h264_idct8_add_c;
3854

    
3855
    c->get_pixels = get_pixels_c;
3856
    c->diff_pixels = diff_pixels_c;
3857
    c->put_pixels_clamped = put_pixels_clamped_c;
3858
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3859
    c->add_pixels_clamped = add_pixels_clamped_c;
3860
    c->add_pixels8 = add_pixels8_c;
3861
    c->add_pixels4 = add_pixels4_c;
3862
    c->gmc1 = gmc1_c;
3863
    c->gmc = gmc_c;
3864
    c->clear_blocks = clear_blocks_c;
3865
    c->pix_sum = pix_sum_c;
3866
    c->pix_norm1 = pix_norm1_c;
3867

    
3868
    /* TODO [0] 16  [1] 8 */
3869
    c->pix_abs[0][0] = pix_abs16_c;
3870
    c->pix_abs[0][1] = pix_abs16_x2_c;
3871
    c->pix_abs[0][2] = pix_abs16_y2_c;
3872
    c->pix_abs[0][3] = pix_abs16_xy2_c;
3873
    c->pix_abs[1][0] = pix_abs8_c;
3874
    c->pix_abs[1][1] = pix_abs8_x2_c;
3875
    c->pix_abs[1][2] = pix_abs8_y2_c;
3876
    c->pix_abs[1][3] = pix_abs8_xy2_c;
3877

    
3878
#define dspfunc(PFX, IDX, NUM) \
3879
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3880
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3881
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3882
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3883

    
3884
    dspfunc(put, 0, 16);
3885
    dspfunc(put_no_rnd, 0, 16);
3886
    dspfunc(put, 1, 8);
3887
    dspfunc(put_no_rnd, 1, 8);
3888
    dspfunc(put, 2, 4);
3889
    dspfunc(put, 3, 2);
3890

    
3891
    dspfunc(avg, 0, 16);
3892
    dspfunc(avg_no_rnd, 0, 16);
3893
    dspfunc(avg, 1, 8);
3894
    dspfunc(avg_no_rnd, 1, 8);
3895
    dspfunc(avg, 2, 4);
3896
    dspfunc(avg, 3, 2);
3897
#undef dspfunc
3898

    
3899
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3900
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3901

    
3902
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3903
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3904
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3905
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3906
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3907
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3908
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3909
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3910
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3911

    
3912
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3913
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3914
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3915
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3916
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3917
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3918
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3919
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3920
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3921

    
3922
/* dspfunc(PFX, IDX, NUM): fill all 16 slots of c->PFX_pixels_tab[IDX] with the
 * symbols PFX<NUM>_mcXY_c (names are built by token pasting).
 * Slot index is X + 4*Y: slots 0..3 get mc00, mc10, mc20, mc30; slots 4..7 get
 * mc01..mc31; and so on up to slot 15 = mc33.
 * NOTE(review): X / Y look like the horizontal / vertical sub-pel phase of the
 * motion-compensation routine -- confirm against the mcXY definitions.
 * The last assignment has no trailing ';', so every use must supply one. */
#define dspfunc(PFX, IDX, NUM) \
3923
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3924
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3925
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3926
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3927
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3928
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3929
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3930
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3931
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3932
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3933
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3934
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3935
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3936
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3937
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3938
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3939

    
3940
    dspfunc(put_qpel, 0, 16);
3941
    dspfunc(put_no_rnd_qpel, 0, 16);
3942

    
3943
    dspfunc(avg_qpel, 0, 16);
3944
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3945

    
3946
    dspfunc(put_qpel, 1, 8);
3947
    dspfunc(put_no_rnd_qpel, 1, 8);
3948

    
3949
    dspfunc(avg_qpel, 1, 8);
3950
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3951

    
3952
    dspfunc(put_h264_qpel, 0, 16);
3953
    dspfunc(put_h264_qpel, 1, 8);
3954
    dspfunc(put_h264_qpel, 2, 4);
3955
    dspfunc(put_h264_qpel, 3, 2);
3956
    dspfunc(avg_h264_qpel, 0, 16);
3957
    dspfunc(avg_h264_qpel, 1, 8);
3958
    dspfunc(avg_h264_qpel, 2, 4);
3959

    
3960
#undef dspfunc
3961
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3962
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3963
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3964
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3965
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3966
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3967

    
3968
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3969
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3970
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3971
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3972
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3973
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3974
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3975
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3976
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3977
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3978
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3979
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3980
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3981
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3982
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3983
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3984
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3985
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3986
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3987
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3988

    
3989
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3990
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3991
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3992
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3993
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3994
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3995
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3996
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3997

    
3998
/* SET_CMP_FUNC(name): set c->name[0] to name16_c and c->name[1] to name8x8_c
 * (names are built by token pasting).
 * The expansion is two full statements and already ends in ';', so it is
 * invoked without a trailing semicolon and must not be used as the unbraced
 * body of an if/else or loop. */
#define SET_CMP_FUNC(name) \
3999
    c->name[0]= name ## 16_c;\
4000
    c->name[1]= name ## 8x8_c;
4001

    
4002
    SET_CMP_FUNC(hadamard8_diff)
4003
    c->hadamard8_diff[4]= hadamard8_intra16_c;
4004
    SET_CMP_FUNC(dct_sad)
4005
    SET_CMP_FUNC(dct_max)
4006
#ifdef CONFIG_GPL
4007
    SET_CMP_FUNC(dct264_sad)
4008
#endif
4009
    c->sad[0]= pix_abs16_c;
4010
    c->sad[1]= pix_abs8_c;
4011
    c->sse[0]= sse16_c;
4012
    c->sse[1]= sse8_c;
4013
    c->sse[2]= sse4_c;
4014
    SET_CMP_FUNC(quant_psnr)
4015
    SET_CMP_FUNC(rd)
4016
    SET_CMP_FUNC(bit)
4017
    c->vsad[0]= vsad16_c;
4018
    c->vsad[4]= vsad_intra16_c;
4019
    c->vsse[0]= vsse16_c;
4020
    c->vsse[4]= vsse_intra16_c;
4021
    c->nsse[0]= nsse16_c;
4022
    c->nsse[1]= nsse8_c;
4023
    c->w53[0]= w53_16_c;
4024
    c->w53[1]= w53_8_c;
4025
    c->w97[0]= w97_16_c;
4026
    c->w97[1]= w97_8_c;
4027

    
4028
    c->add_bytes= add_bytes_c;
4029
    c->diff_bytes= diff_bytes_c;
4030
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4031
    c->bswap_buf= bswap_buf;
4032

    
4033
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4034
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4035
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4036
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4037
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4038
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4039

    
4040
    c->h263_h_loop_filter= h263_h_loop_filter_c;
4041
    c->h263_v_loop_filter= h263_v_loop_filter_c;
4042

    
4043
    c->h261_loop_filter= h261_loop_filter_c;
4044

    
4045
    c->try_8x8basis= try_8x8basis_c;
4046
    c->add_8x8basis= add_8x8basis_c;
4047

    
4048
#ifdef HAVE_MMX
4049
    dsputil_init_mmx(c, avctx);
4050
#endif
4051
#ifdef ARCH_ARMV4L
4052
    dsputil_init_armv4l(c, avctx);
4053
#endif
4054
#ifdef HAVE_MLIB
4055
    dsputil_init_mlib(c, avctx);
4056
#endif
4057
#ifdef ARCH_SPARC
4058
   dsputil_init_vis(c,avctx);
4059
#endif
4060
#ifdef ARCH_ALPHA
4061
    dsputil_init_alpha(c, avctx);
4062
#endif
4063
#ifdef ARCH_POWERPC
4064
    dsputil_init_ppc(c, avctx);
4065
#endif
4066
#ifdef HAVE_MMI
4067
    dsputil_init_mmi(c, avctx);
4068
#endif
4069
#ifdef ARCH_SH4
4070
    dsputil_init_sh4(c,avctx);
4071
#endif
4072

    
4073
    switch(c->idct_permutation_type){
4074
    case FF_NO_IDCT_PERM:
4075
        for(i=0; i<64; i++)
4076
            c->idct_permutation[i]= i;
4077
        break;
4078
    case FF_LIBMPEG2_IDCT_PERM:
4079
        for(i=0; i<64; i++)
4080
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4081
        break;
4082
    case FF_SIMPLE_IDCT_PERM:
4083
        for(i=0; i<64; i++)
4084
            c->idct_permutation[i]= simple_mmx_permutation[i];
4085
        break;
4086
    case FF_TRANSPOSE_IDCT_PERM:
4087
        for(i=0; i<64; i++)
4088
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4089
        break;
4090
    case FF_PARTTRANS_IDCT_PERM:
4091
        for(i=0; i<64; i++)
4092
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4093
        break;
4094
    default:
4095
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4096
    }
4097
}
4098