Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 513fbd8e

History | View | Annotate | Download (144 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22

    
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27

    
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33
#include "snow.h"
34

    
35
/* snow.c */
36
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37

    
38
/* Clamping table: indexed via cropTbl + MAX_NEG_CROP so indices in
   [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] are valid; zero here, presumably
   filled by the dsputil init code — confirm against dsputil_init(). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table: indexed via squareTbl + 256 so indices in [-256, 255]
   are valid; zero here, presumably filled at init — confirm. */
uint32_t squareTbl[512] = {0, };
40

    
41
/* Classic zigzag scan order for an 8x8 coefficient block
   (maps scan position -> raster index). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
51

    
52
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
64

    
65
/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   zero here, presumably filled at init time — confirm against dsputil init. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
67

    
68
/* Alternate (horizontal-first) scan order for an 8x8 block
   (maps scan position -> raster index). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
78

    
79
/* Alternate (vertical-first) scan order for an 8x8 block
   (maps scan position -> raster index). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89

    
90
/* Reciprocal table for division by multiplication:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
   Entry b is approximately 2^32/b rounded up; entries 0 and 1 are
   0 and 0xFFFFFFFF respectively. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
125

    
126
/* Input permutation for the simple_idct_mmx
   (maps raster index -> permuted coefficient index). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137

    
138
/**
 * Sum all pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        /* two groups of eight pixels per row */
        for (col = 0; col < 16; col += 8) {
            total += pix[0] + pix[1] + pix[2] + pix[3]
                   + pix[4] + pix[5] + pix[6] + pix[7];
            pix += 8;
        }
        /* step to the next row; 16 bytes of this row already consumed */
        pix += line_size - 16;
    }
    return total;
}
159

    
160
/**
 * Sum of squares of all pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum over the 256 pixels of pix[x]^2 (looked up in squareTbl)
 *
 * NOTE(review): the active path reads pixels via *(uint64_t*)pix /
 * *(uint32_t*)pix, which assumes the rows are suitably aligned and relies
 * on type punning that violates strict aliasing — presumably acceptable
 * given the project's build flags; confirm. The "#if 0" branch is the
 * straightforward byte-wise equivalent kept for reference.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    /* centre of the table: squares of -256..255 are addressable */
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: load eight pixels at once and square each byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-pixel loads per group of eight */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        /* advance to the next row (16 bytes already consumed) */
        pix += line_size - 16;
    }
    return s;
}
207

    
208
/**
 * Byte-swap a buffer of 32-bit words.
 * @param dst destination buffer (may equal src)
 * @param src source buffer
 * @param w   number of 32-bit words to swap
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* main loop: eight words per iteration */
    while (i + 8 <= w) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* tail: remaining 0..7 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
225

    
226
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227
{
228
    int s, i;
229
    uint32_t *sq = squareTbl + 256;
230

    
231
    s = 0;
232
    for (i = 0; i < h; i++) {
233
        s += sq[pix1[0] - pix2[0]];
234
        s += sq[pix1[1] - pix2[1]];
235
        s += sq[pix1[2] - pix2[2]];
236
        s += sq[pix1[3] - pix2[3]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242

    
243
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244
{
245
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247

    
248
    s = 0;
249
    for (i = 0; i < h; i++) {
250
        s += sq[pix1[0] - pix2[0]];
251
        s += sq[pix1[1] - pix2[1]];
252
        s += sq[pix1[2] - pix2[2]];
253
        s += sq[pix1[3] - pix2[3]];
254
        s += sq[pix1[4] - pix2[4]];
255
        s += sq[pix1[5] - pix2[5]];
256
        s += sq[pix1[6] - pix2[6]];
257
        s += sq[pix1[7] - pix2[7]];
258
        pix1 += line_size;
259
        pix2 += line_size;
260
    }
261
    return s;
262
}
263

    
264
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
265
{
266
    int s, i;
267
    uint32_t *sq = squareTbl + 256;
268

    
269
    s = 0;
270
    for (i = 0; i < h; i++) {
271
        s += sq[pix1[ 0] - pix2[ 0]];
272
        s += sq[pix1[ 1] - pix2[ 1]];
273
        s += sq[pix1[ 2] - pix2[ 2]];
274
        s += sq[pix1[ 3] - pix2[ 3]];
275
        s += sq[pix1[ 4] - pix2[ 4]];
276
        s += sq[pix1[ 5] - pix2[ 5]];
277
        s += sq[pix1[ 6] - pix2[ 6]];
278
        s += sq[pix1[ 7] - pix2[ 7]];
279
        s += sq[pix1[ 8] - pix2[ 8]];
280
        s += sq[pix1[ 9] - pix2[ 9]];
281
        s += sq[pix1[10] - pix2[10]];
282
        s += sq[pix1[11] - pix2[11]];
283
        s += sq[pix1[12] - pix2[12]];
284
        s += sq[pix1[13] - pix2[13]];
285
        s += sq[pix1[14] - pix2[14]];
286
        s += sq[pix1[15] - pix2[15]];
287

    
288
        pix1 += line_size;
289
        pix2 += line_size;
290
    }
291
    return s;
292
}
293

    
294

    
295
/**
 * Wavelet-domain distortion between two blocks: computes the scaled pixel
 * difference, runs a forward spatial DWT on it, and sums the absolute
 * transform coefficients.
 * @param v         unused context pointer (DSPContext comparison API)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride for both blocks
 * @param w         block width, 8 or 16 (selects 3 or 4 decomposition levels)
 * @param h         block height
 * @param type      wavelet type passed to ff_spatial_dwt()
 * @return sum of absolute DWT coefficients, >> 2
 *
 * NOTE(review): the entire body is guarded by #ifdef CONFIG_SNOW_ENCODER,
 * so when that symbol is undefined this non-void function falls off the end
 * without a return — presumably it is never called in that configuration,
 * but the compiler will warn; confirm.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    /* 8-wide blocks use 3 decomposition levels, 16-wide use 4 */
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled: per-subband scaling tables for a weighted distortion */
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* difference of the two blocks, scaled by 16, into a 16-wide scratch */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    /* in-place forward wavelet transform (implemented in snow.c) */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled: subband-weighted accumulation using the scale tables above */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted sum of absolute coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
#endif
}
377

    
378
/* Wavelet distortion, 8-wide block, type 1 (5/3 per w_c's tables); see w_c(). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
381

    
382
/* Wavelet distortion, 8-wide block, type 0 (9/7 per w_c's tables); see w_c(). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
385

    
386
/* Wavelet distortion, 16-wide block, type 1 (5/3 per w_c's tables); see w_c(). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
389

    
390
/* Wavelet distortion, 16-wide block, type 0 (9/7 per w_c's tables); see w_c(). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
393

    
394
/**
 * Copy an 8x8 block of pixels into a coefficient block (widening to DCTELEM).
 * @param block     destination, 64 contiguous coefficients
 * @param pixels    source, top-left of the 8x8 block
 * @param line_size byte stride between source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
412

    
413
/**
 * Store the element-wise difference of two 8x8 pixel blocks.
 * @param block  destination, 64 contiguous coefficients (s1 - s2)
 * @param s1     first source block
 * @param s2     second source block
 * @param stride byte stride between rows of both sources
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
432

    
433

    
434
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
435
                                 int line_size)
436
{
437
    int i;
438
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
439

    
440
    /* read the pixels */
441
    for(i=0;i<8;i++) {
442
        pixels[0] = cm[block[0]];
443
        pixels[1] = cm[block[1]];
444
        pixels[2] = cm[block[2]];
445
        pixels[3] = cm[block[3]];
446
        pixels[4] = cm[block[4]];
447
        pixels[5] = cm[block[5]];
448
        pixels[6] = cm[block[6]];
449
        pixels[7] = cm[block[7]];
450

    
451
        pixels += line_size;
452
        block += 8;
453
    }
454
}
455

    
456
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
457
                                 int line_size)
458
{
459
    int i;
460
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
461

    
462
    /* read the pixels */
463
    for(i=0;i<4;i++) {
464
        pixels[0] = cm[block[0]];
465
        pixels[1] = cm[block[1]];
466
        pixels[2] = cm[block[2]];
467
        pixels[3] = cm[block[3]];
468

    
469
        pixels += line_size;
470
        block += 8;
471
    }
472
}
473

    
474
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
475
                                 int line_size)
476
{
477
    int i;
478
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
479

    
480
    /* read the pixels */
481
    for(i=0;i<2;i++) {
482
        pixels[0] = cm[block[0]];
483
        pixels[1] = cm[block[1]];
484

    
485
        pixels += line_size;
486
        block += 8;
487
    }
488
}
489

    
490
/**
 * Write an 8x8 coefficient block to pixels, shifting the signed range
 * [-128,127] to [0,255] with saturation.
 * @param block     source, 64 contiguous coefficients
 * @param pixels    destination, top-left of the 8x8 block
 * @param line_size byte stride between destination rows
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const DCTELEM v = *block;
            /* saturate to [-128,127], then bias by +128 */
            if (v < -128)
                *pixels = 0;
            else if (v > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(v + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
510

    
511
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
512
                          int line_size)
513
{
514
    int i;
515
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
516

    
517
    /* read the pixels */
518
    for(i=0;i<8;i++) {
519
        pixels[0] = cm[pixels[0] + block[0]];
520
        pixels[1] = cm[pixels[1] + block[1]];
521
        pixels[2] = cm[pixels[2] + block[2]];
522
        pixels[3] = cm[pixels[3] + block[3]];
523
        pixels[4] = cm[pixels[4] + block[4]];
524
        pixels[5] = cm[pixels[5] + block[5]];
525
        pixels[6] = cm[pixels[6] + block[6]];
526
        pixels[7] = cm[pixels[7] + block[7]];
527
        pixels += line_size;
528
        block += 8;
529
    }
530
}
531

    
532
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
533
                          int line_size)
534
{
535
    int i;
536
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
537

    
538
    /* read the pixels */
539
    for(i=0;i<4;i++) {
540
        pixels[0] = cm[pixels[0] + block[0]];
541
        pixels[1] = cm[pixels[1] + block[1]];
542
        pixels[2] = cm[pixels[2] + block[2]];
543
        pixels[3] = cm[pixels[3] + block[3]];
544
        pixels += line_size;
545
        block += 8;
546
    }
547
}
548

    
549
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
550
                          int line_size)
551
{
552
    int i;
553
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
554

    
555
    /* read the pixels */
556
    for(i=0;i<2;i++) {
557
        pixels[0] = cm[pixels[0] + block[0]];
558
        pixels[1] = cm[pixels[1] + block[1]];
559
        pixels += line_size;
560
        block += 8;
561
    }
562
}
563

    
564
/**
 * Add an 8x8 coefficient block to pixels WITHOUT clamping
 * (wraps on uint8_t overflow).
 * @param pixels    destination/accumulator, top-left of the 8x8 block
 * @param block     source, 64 contiguous coefficients
 * @param line_size byte stride between destination rows
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
580

    
581
/**
 * Add a 4x4 coefficient block to pixels WITHOUT clamping
 * (wraps on uint8_t overflow). Unlike add_pixels_clamped4_c, the
 * coefficients here are packed 4 per row (block advances by 4).
 * @param pixels    destination/accumulator, top-left of the 4x4 block
 * @param block     source, 16 contiguous coefficients
 * @param line_size byte stride between destination rows
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
593

    
594
#if 0
595

596
#define PIXOP2(OPNAME, OP) \
597
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
598
{\
599
    int i;\
600
    for(i=0; i<h; i++){\
601
        OP(*((uint64_t*)block), LD64(pixels));\
602
        pixels+=line_size;\
603
        block +=line_size;\
604
    }\
605
}\
606
\
607
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
608
{\
609
    int i;\
610
    for(i=0; i<h; i++){\
611
        const uint64_t a= LD64(pixels  );\
612
        const uint64_t b= LD64(pixels+1);\
613
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
614
        pixels+=line_size;\
615
        block +=line_size;\
616
    }\
617
}\
618
\
619
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
620
{\
621
    int i;\
622
    for(i=0; i<h; i++){\
623
        const uint64_t a= LD64(pixels  );\
624
        const uint64_t b= LD64(pixels+1);\
625
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
626
        pixels+=line_size;\
627
        block +=line_size;\
628
    }\
629
}\
630
\
631
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
632
{\
633
    int i;\
634
    for(i=0; i<h; i++){\
635
        const uint64_t a= LD64(pixels          );\
636
        const uint64_t b= LD64(pixels+line_size);\
637
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
638
        pixels+=line_size;\
639
        block +=line_size;\
640
    }\
641
}\
642
\
643
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
644
{\
645
    int i;\
646
    for(i=0; i<h; i++){\
647
        const uint64_t a= LD64(pixels          );\
648
        const uint64_t b= LD64(pixels+line_size);\
649
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
650
        pixels+=line_size;\
651
        block +=line_size;\
652
    }\
653
}\
654
\
655
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
656
{\
657
        int i;\
658
        const uint64_t a= LD64(pixels  );\
659
        const uint64_t b= LD64(pixels+1);\
660
        uint64_t l0=  (a&0x0303030303030303ULL)\
661
                    + (b&0x0303030303030303ULL)\
662
                    + 0x0202020202020202ULL;\
663
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
664
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
665
        uint64_t l1,h1;\
666
\
667
        pixels+=line_size;\
668
        for(i=0; i<h; i+=2){\
669
            uint64_t a= LD64(pixels  );\
670
            uint64_t b= LD64(pixels+1);\
671
            l1=  (a&0x0303030303030303ULL)\
672
               + (b&0x0303030303030303ULL);\
673
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
674
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
675
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
676
            pixels+=line_size;\
677
            block +=line_size;\
678
            a= LD64(pixels  );\
679
            b= LD64(pixels+1);\
680
            l0=  (a&0x0303030303030303ULL)\
681
               + (b&0x0303030303030303ULL)\
682
               + 0x0202020202020202ULL;\
683
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
684
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
685
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
686
            pixels+=line_size;\
687
            block +=line_size;\
688
        }\
689
}\
690
\
691
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
692
{\
693
        int i;\
694
        const uint64_t a= LD64(pixels  );\
695
        const uint64_t b= LD64(pixels+1);\
696
        uint64_t l0=  (a&0x0303030303030303ULL)\
697
                    + (b&0x0303030303030303ULL)\
698
                    + 0x0101010101010101ULL;\
699
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
700
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
701
        uint64_t l1,h1;\
702
\
703
        pixels+=line_size;\
704
        for(i=0; i<h; i+=2){\
705
            uint64_t a= LD64(pixels  );\
706
            uint64_t b= LD64(pixels+1);\
707
            l1=  (a&0x0303030303030303ULL)\
708
               + (b&0x0303030303030303ULL);\
709
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
710
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
711
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
712
            pixels+=line_size;\
713
            block +=line_size;\
714
            a= LD64(pixels  );\
715
            b= LD64(pixels+1);\
716
            l0=  (a&0x0303030303030303ULL)\
717
               + (b&0x0303030303030303ULL)\
718
               + 0x0101010101010101ULL;\
719
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
720
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
721
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
722
            pixels+=line_size;\
723
            block +=line_size;\
724
        }\
725
}\
726
\
727
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
728
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
729
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
730
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
731
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
732
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
733
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
734

735
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
736
#else // 64 bit variant
737

    
738
#define PIXOP2(OPNAME, OP) \
739
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
740
    int i;\
741
    for(i=0; i<h; i++){\
742
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
743
        pixels+=line_size;\
744
        block +=line_size;\
745
    }\
746
}\
747
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
748
    int i;\
749
    for(i=0; i<h; i++){\
750
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
751
        pixels+=line_size;\
752
        block +=line_size;\
753
    }\
754
}\
755
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
756
    int i;\
757
    for(i=0; i<h; i++){\
758
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
759
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
760
        pixels+=line_size;\
761
        block +=line_size;\
762
    }\
763
}\
764
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
765
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
766
}\
767
\
768
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
769
                                                int src_stride1, int src_stride2, int h){\
770
    int i;\
771
    for(i=0; i<h; i++){\
772
        uint32_t a,b;\
773
        a= LD32(&src1[i*src_stride1  ]);\
774
        b= LD32(&src2[i*src_stride2  ]);\
775
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
776
        a= LD32(&src1[i*src_stride1+4]);\
777
        b= LD32(&src2[i*src_stride2+4]);\
778
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
779
    }\
780
}\
781
\
782
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
783
                                                int src_stride1, int src_stride2, int h){\
784
    int i;\
785
    for(i=0; i<h; i++){\
786
        uint32_t a,b;\
787
        a= LD32(&src1[i*src_stride1  ]);\
788
        b= LD32(&src2[i*src_stride2  ]);\
789
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
790
        a= LD32(&src1[i*src_stride1+4]);\
791
        b= LD32(&src2[i*src_stride2+4]);\
792
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
793
    }\
794
}\
795
\
796
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
797
                                                int src_stride1, int src_stride2, int h){\
798
    int i;\
799
    for(i=0; i<h; i++){\
800
        uint32_t a,b;\
801
        a= LD32(&src1[i*src_stride1  ]);\
802
        b= LD32(&src2[i*src_stride2  ]);\
803
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
804
    }\
805
}\
806
\
807
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
808
                                                int src_stride1, int src_stride2, int h){\
809
    int i;\
810
    for(i=0; i<h; i++){\
811
        uint32_t a,b;\
812
        a= LD16(&src1[i*src_stride1  ]);\
813
        b= LD16(&src2[i*src_stride2  ]);\
814
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
815
    }\
816
}\
817
\
818
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
819
                                                int src_stride1, int src_stride2, int h){\
820
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
821
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
822
}\
823
\
824
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
825
                                                int src_stride1, int src_stride2, int h){\
826
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
827
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
828
}\
829
\
830
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
831
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832
}\
833
\
834
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
836
}\
837
\
838
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
839
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840
}\
841
\
842
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
843
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
844
}\
845
\
846
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
847
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
848
    int i;\
849
    for(i=0; i<h; i++){\
850
        uint32_t a, b, c, d, l0, l1, h0, h1;\
851
        a= LD32(&src1[i*src_stride1]);\
852
        b= LD32(&src2[i*src_stride2]);\
853
        c= LD32(&src3[i*src_stride3]);\
854
        d= LD32(&src4[i*src_stride4]);\
855
        l0=  (a&0x03030303UL)\
856
           + (b&0x03030303UL)\
857
           + 0x02020202UL;\
858
        h0= ((a&0xFCFCFCFCUL)>>2)\
859
          + ((b&0xFCFCFCFCUL)>>2);\
860
        l1=  (c&0x03030303UL)\
861
           + (d&0x03030303UL);\
862
        h1= ((c&0xFCFCFCFCUL)>>2)\
863
          + ((d&0xFCFCFCFCUL)>>2);\
864
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
865
        a= LD32(&src1[i*src_stride1+4]);\
866
        b= LD32(&src2[i*src_stride2+4]);\
867
        c= LD32(&src3[i*src_stride3+4]);\
868
        d= LD32(&src4[i*src_stride4+4]);\
869
        l0=  (a&0x03030303UL)\
870
           + (b&0x03030303UL)\
871
           + 0x02020202UL;\
872
        h0= ((a&0xFCFCFCFCUL)>>2)\
873
          + ((b&0xFCFCFCFCUL)>>2);\
874
        l1=  (c&0x03030303UL)\
875
           + (d&0x03030303UL);\
876
        h1= ((c&0xFCFCFCFCUL)>>2)\
877
          + ((d&0xFCFCFCFCUL)>>2);\
878
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879
    }\
880
}\
881
\
882
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
884
}\
885
\
886
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
888
}\
889
\
890
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
892
}\
893
\
894
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
896
}\
897
\
898
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900
    int i;\
901
    for(i=0; i<h; i++){\
902
        uint32_t a, b, c, d, l0, l1, h0, h1;\
903
        a= LD32(&src1[i*src_stride1]);\
904
        b= LD32(&src2[i*src_stride2]);\
905
        c= LD32(&src3[i*src_stride3]);\
906
        d= LD32(&src4[i*src_stride4]);\
907
        l0=  (a&0x03030303UL)\
908
           + (b&0x03030303UL)\
909
           + 0x01010101UL;\
910
        h0= ((a&0xFCFCFCFCUL)>>2)\
911
          + ((b&0xFCFCFCFCUL)>>2);\
912
        l1=  (c&0x03030303UL)\
913
           + (d&0x03030303UL);\
914
        h1= ((c&0xFCFCFCFCUL)>>2)\
915
          + ((d&0xFCFCFCFCUL)>>2);\
916
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
917
        a= LD32(&src1[i*src_stride1+4]);\
918
        b= LD32(&src2[i*src_stride2+4]);\
919
        c= LD32(&src3[i*src_stride3+4]);\
920
        d= LD32(&src4[i*src_stride4+4]);\
921
        l0=  (a&0x03030303UL)\
922
           + (b&0x03030303UL)\
923
           + 0x01010101UL;\
924
        h0= ((a&0xFCFCFCFCUL)>>2)\
925
          + ((b&0xFCFCFCFCUL)>>2);\
926
        l1=  (c&0x03030303UL)\
927
           + (d&0x03030303UL);\
928
        h1= ((c&0xFCFCFCFCUL)>>2)\
929
          + ((d&0xFCFCFCFCUL)>>2);\
930
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
931
    }\
932
}\
933
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
934
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
935
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937
}\
938
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
939
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
940
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942
}\
943
\
944
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945
{\
946
        int i, a0, b0, a1, b1;\
947
        a0= pixels[0];\
948
        b0= pixels[1] + 2;\
949
        a0 += b0;\
950
        b0 += pixels[2];\
951
\
952
        pixels+=line_size;\
953
        for(i=0; i<h; i+=2){\
954
            a1= pixels[0];\
955
            b1= pixels[1];\
956
            a1 += b1;\
957
            b1 += pixels[2];\
958
\
959
            block[0]= (a1+a0)>>2; /* FIXME non put */\
960
            block[1]= (b1+b0)>>2;\
961
\
962
            pixels+=line_size;\
963
            block +=line_size;\
964
\
965
            a0= pixels[0];\
966
            b0= pixels[1] + 2;\
967
            a0 += b0;\
968
            b0 += pixels[2];\
969
\
970
            block[0]= (a1+a0)>>2;\
971
            block[1]= (b1+b0)>>2;\
972
            pixels+=line_size;\
973
            block +=line_size;\
974
        }\
975
}\
976
\
977
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
978
{\
979
        int i;\
980
        const uint32_t a= LD32(pixels  );\
981
        const uint32_t b= LD32(pixels+1);\
982
        uint32_t l0=  (a&0x03030303UL)\
983
                    + (b&0x03030303UL)\
984
                    + 0x02020202UL;\
985
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986
                   + ((b&0xFCFCFCFCUL)>>2);\
987
        uint32_t l1,h1;\
988
\
989
        pixels+=line_size;\
990
        for(i=0; i<h; i+=2){\
991
            uint32_t a= LD32(pixels  );\
992
            uint32_t b= LD32(pixels+1);\
993
            l1=  (a&0x03030303UL)\
994
               + (b&0x03030303UL);\
995
            h1= ((a&0xFCFCFCFCUL)>>2)\
996
              + ((b&0xFCFCFCFCUL)>>2);\
997
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998
            pixels+=line_size;\
999
            block +=line_size;\
1000
            a= LD32(pixels  );\
1001
            b= LD32(pixels+1);\
1002
            l0=  (a&0x03030303UL)\
1003
               + (b&0x03030303UL)\
1004
               + 0x02020202UL;\
1005
            h0= ((a&0xFCFCFCFCUL)>>2)\
1006
              + ((b&0xFCFCFCFCUL)>>2);\
1007
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008
            pixels+=line_size;\
1009
            block +=line_size;\
1010
        }\
1011
}\
1012
\
1013
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1014
{\
1015
    int j;\
1016
    for(j=0; j<2; j++){\
1017
        int i;\
1018
        const uint32_t a= LD32(pixels  );\
1019
        const uint32_t b= LD32(pixels+1);\
1020
        uint32_t l0=  (a&0x03030303UL)\
1021
                    + (b&0x03030303UL)\
1022
                    + 0x02020202UL;\
1023
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1024
                   + ((b&0xFCFCFCFCUL)>>2);\
1025
        uint32_t l1,h1;\
1026
\
1027
        pixels+=line_size;\
1028
        for(i=0; i<h; i+=2){\
1029
            uint32_t a= LD32(pixels  );\
1030
            uint32_t b= LD32(pixels+1);\
1031
            l1=  (a&0x03030303UL)\
1032
               + (b&0x03030303UL);\
1033
            h1= ((a&0xFCFCFCFCUL)>>2)\
1034
              + ((b&0xFCFCFCFCUL)>>2);\
1035
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1036
            pixels+=line_size;\
1037
            block +=line_size;\
1038
            a= LD32(pixels  );\
1039
            b= LD32(pixels+1);\
1040
            l0=  (a&0x03030303UL)\
1041
               + (b&0x03030303UL)\
1042
               + 0x02020202UL;\
1043
            h0= ((a&0xFCFCFCFCUL)>>2)\
1044
              + ((b&0xFCFCFCFCUL)>>2);\
1045
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046
            pixels+=line_size;\
1047
            block +=line_size;\
1048
        }\
1049
        pixels+=4-line_size*(h+1);\
1050
        block +=4-line_size*h;\
1051
    }\
1052
}\
1053
\
1054
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1055
{\
1056
    int j;\
1057
    for(j=0; j<2; j++){\
1058
        int i;\
1059
        const uint32_t a= LD32(pixels  );\
1060
        const uint32_t b= LD32(pixels+1);\
1061
        uint32_t l0=  (a&0x03030303UL)\
1062
                    + (b&0x03030303UL)\
1063
                    + 0x01010101UL;\
1064
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1065
                   + ((b&0xFCFCFCFCUL)>>2);\
1066
        uint32_t l1,h1;\
1067
\
1068
        pixels+=line_size;\
1069
        for(i=0; i<h; i+=2){\
1070
            uint32_t a= LD32(pixels  );\
1071
            uint32_t b= LD32(pixels+1);\
1072
            l1=  (a&0x03030303UL)\
1073
               + (b&0x03030303UL);\
1074
            h1= ((a&0xFCFCFCFCUL)>>2)\
1075
              + ((b&0xFCFCFCFCUL)>>2);\
1076
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077
            pixels+=line_size;\
1078
            block +=line_size;\
1079
            a= LD32(pixels  );\
1080
            b= LD32(pixels+1);\
1081
            l0=  (a&0x03030303UL)\
1082
               + (b&0x03030303UL)\
1083
               + 0x01010101UL;\
1084
            h0= ((a&0xFCFCFCFCUL)>>2)\
1085
              + ((b&0xFCFCFCFCUL)>>2);\
1086
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087
            pixels+=line_size;\
1088
            block +=line_size;\
1089
        }\
1090
        pixels+=4-line_size*(h+1);\
1091
        block +=4-line_size*h;\
1092
    }\
1093
}\
1094
\
1095
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1096
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1098
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1099
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1100
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1101
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1102
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1103

    
1104
#define op_avg(a, b) a = rnd_avg32(a, b)
1105
#endif
1106
#define op_put(a, b) a = b
1107

    
1108
PIXOP2(avg, op_avg)
1109
PIXOP2(put, op_put)
1110
#undef op_avg
1111
#undef op_put
1112

    
1113
#define avg2(a,b) ((a+b+1)>>1)
1114
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1115

    
1116
/* Thin wrapper: forwards to the macro-generated 16-wide no-rounding l2
 * averager with all three strides (dst, src a, src b) equal. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1119

    
1120
/* Thin wrapper: forwards to the macro-generated 8-wide no-rounding l2
 * averager with all three strides (dst, src a, src b) equal. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1123

    
1124
/* Bilinear interpolation of an 8-pixel-wide block at 1/16-pel precision.
 * x16,y16 are the fractional positions (0..16); the four corner weights
 * A..D sum to 256, so the result is renormalized with >>8 after adding
 * the caller-supplied rounder. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]
                      + B * src[col + 1]
                      + C * src[stride + col]
                      + D * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1146

    
1147
/**
 * Global motion compensation with an affine warp.
 * For each destination pixel the source coordinate advances by (dxx,dyx)
 * per column and (dxy,dyy) per row, accumulated in vx/vy (16.16 fixed
 * point before the additional >>shift below).  Out-of-range source
 * coordinates are clamped via clip() (project helper — presumably clamps
 * to [lo,hi]; confirm in common.h) and the interpolation collapses to
 * 1-D or nearest-neighbour at the borders.
 * r is the rounding constant applied before the final >>(shift*2).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel denominator */

    /* convert to the largest valid coordinate for the unsigned range tests */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* integer part after dropping 16 fractional bits, then the
             * remaining 'shift' sub-pel bits are split off into frac_* */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare folds the src_x >= 0 test into one branch */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, horizontal-only filter */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, vertical-only filter */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both outside: clamp both, nearest sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        /* per-row step of the affine transform */
        ox += dxy;
        oy += dyy;
    }
}
1204

    
1205
/* Full-pel case: no filtering needed, dispatch on block width to the
 * plain copy routines. Unsupported widths are silently ignored, as in
 * the original switch (which had no default). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1213

    
1214
/* Horizontal interpolation with weights 2/3,1/3 on src[x],src[x+1];
 * 683/2048 approximates 1/3, hence the *683 and >>11. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1224

    
1225
/* Horizontal interpolation with weights 1/3,2/3 on src[x],src[x+1]
 * (683/2048 approximates 1/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1235

    
1236
/* Vertical interpolation with weights 2/3,1/3 on src[x],src[x+stride]
 * (683/2048 approximates 1/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1246

    
1247
/* 2-D interpolation, weights 4:3:3:2 (sum 12) over the 2x2 neighbourhood;
 * 2731/32768 approximates 1/12, hence *2731 and >>15. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1257

    
1258
/* 2-D interpolation, weights 3:2:4:3 (sum 12) over the 2x2 neighbourhood;
 * 2731/32768 approximates 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1268

    
1269
/* Vertical interpolation with weights 1/3,2/3 on src[x],src[x+stride]
 * (683/2048 approximates 1/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1279

    
1280
/* 2-D interpolation, weights 3:4:2:3 (sum 12) over the 2x2 neighbourhood;
 * 2731/32768 approximates 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1290

    
1291
/* 2-D interpolation, weights 2:3:3:4 (sum 12) over the 2x2 neighbourhood;
 * 2731/32768 approximates 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1301

    
1302
/* Full-pel case: no filtering needed, dispatch on block width to the
 * plain averaging routines. Unsupported widths are silently ignored,
 * as in the original switch (which had no default). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1310

    
1311
/* Same horizontal 2/3,1/3 filter as put_tpel_pixels_mc10_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1321

    
1322
/* Same horizontal 1/3,2/3 filter as put_tpel_pixels_mc20_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1332

    
1333
/* Same vertical 2/3,1/3 filter as put_tpel_pixels_mc01_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1343

    
1344
/* Same 4:3:3:2 2-D filter as put_tpel_pixels_mc11_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1354

    
1355
/* Same 3:2:4:3 2-D filter as put_tpel_pixels_mc12_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1365

    
1366
/* Same vertical 1/3,2/3 filter as put_tpel_pixels_mc02_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1376

    
1377
/* Same 3:4:2:3 2-D filter as put_tpel_pixels_mc21_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1387

    
1388
/* Same 2:3:3:4 2-D filter as put_tpel_pixels_mc22_c, then
 * rounded-averaged into the existing dst contents. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            dst[x] = (dst[x] + ((2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1398
#if 0
/* Disabled width-specializing wrappers for the tpel MC routines.
 * Fix: the original bodies read "void put_tpel_pixels_mcXX_c(...)" —
 * the stray "void" turned each intended call into an invalid declaration,
 * which would fail to compile if this block were ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1419

    
1420
/* Bilinear chroma motion compensation for block widths 2/4/8.
 * x,y are the 1/8-pel fractional offsets (0..7); the four corner weights
 * A..D sum to 64, and OP performs the final renormalization/rounding
 * (and, for avg, the merge with existing dst contents). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<4; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<8; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/* weights sum to 64, hence the +32 rounding and >>6 */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1490

    
1491
/* Copy h rows of 2 bytes each via the project's 16-bit load/store macros. */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST16(dst, LD16(src));
        dst += dstStride;
        src += srcStride;
    }
}
1501

    
1502
/* Copy h rows of 4 bytes each via the project's 32-bit load/store macros. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1512

    
1513
/* Copy h rows of 8 bytes each as two 32-bit load/store pairs. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1524

    
1525
/* Copy h rows of 16 bytes each as four 32-bit load/store pairs. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1538

    
1539
/* Copy h rows of 17 bytes each: four 32-bit pairs plus a trailing byte
 * (17-wide blocks are used by the qpel filters, which read one extra column). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1553

    
1554
/* Copy h rows of 9 bytes each: two 32-bit pairs plus a trailing byte. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1566

    
1567

    
1568
#define QPEL_MC(r, OPNAME, RND, OP) \
1569
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1571
    int i;\
1572
    for(i=0; i<h; i++)\
1573
    {\
1574
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1575
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1576
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1577
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1578
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1579
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1580
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1581
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1582
        dst+=dstStride;\
1583
        src+=srcStride;\
1584
    }\
1585
}\
1586
\
1587
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1588
    const int w=8;\
1589
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1590
    int i;\
1591
    for(i=0; i<w; i++)\
1592
    {\
1593
        const int src0= src[0*srcStride];\
1594
        const int src1= src[1*srcStride];\
1595
        const int src2= src[2*srcStride];\
1596
        const int src3= src[3*srcStride];\
1597
        const int src4= src[4*srcStride];\
1598
        const int src5= src[5*srcStride];\
1599
        const int src6= src[6*srcStride];\
1600
        const int src7= src[7*srcStride];\
1601
        const int src8= src[8*srcStride];\
1602
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1603
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1604
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1605
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1606
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1607
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1608
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1609
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1610
        dst++;\
1611
        src++;\
1612
    }\
1613
}\
1614
\
1615
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1617
    int i;\
1618
    \
1619
    for(i=0; i<h; i++)\
1620
    {\
1621
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1622
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1623
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1624
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1625
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1626
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1627
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1628
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1629
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1630
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1631
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1632
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1633
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1634
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1635
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1636
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1637
        dst+=dstStride;\
1638
        src+=srcStride;\
1639
    }\
1640
}\
1641
\
1642
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1643
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1644
    int i;\
1645
    const int w=16;\
1646
    for(i=0; i<w; i++)\
1647
    {\
1648
        const int src0= src[0*srcStride];\
1649
        const int src1= src[1*srcStride];\
1650
        const int src2= src[2*srcStride];\
1651
        const int src3= src[3*srcStride];\
1652
        const int src4= src[4*srcStride];\
1653
        const int src5= src[5*srcStride];\
1654
        const int src6= src[6*srcStride];\
1655
        const int src7= src[7*srcStride];\
1656
        const int src8= src[8*srcStride];\
1657
        const int src9= src[9*srcStride];\
1658
        const int src10= src[10*srcStride];\
1659
        const int src11= src[11*srcStride];\
1660
        const int src12= src[12*srcStride];\
1661
        const int src13= src[13*srcStride];\
1662
        const int src14= src[14*srcStride];\
1663
        const int src15= src[15*srcStride];\
1664
        const int src16= src[16*srcStride];\
1665
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1666
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1667
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1668
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1669
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1670
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1671
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1672
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1673
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1674
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1675
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1676
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1677
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1678
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1679
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1680
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1681
        dst++;\
1682
        src++;\
1683
    }\
1684
}\
1685
\
1686
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1687
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1688
}\
1689
\
1690
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1691
    uint8_t half[64];\
1692
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1693
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1694
}\
1695
\
1696
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1697
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1698
}\
1699
\
1700
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1701
    uint8_t half[64];\
1702
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1703
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1704
}\
1705
\
1706
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1707
    uint8_t full[16*9];\
1708
    uint8_t half[64];\
1709
    copy_block9(full, src, 16, stride, 9);\
1710
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1711
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1712
}\
1713
\
1714
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1715
    uint8_t full[16*9];\
1716
    copy_block9(full, src, 16, stride, 9);\
1717
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1718
}\
1719
\
1720
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1721
    uint8_t full[16*9];\
1722
    uint8_t half[64];\
1723
    copy_block9(full, src, 16, stride, 9);\
1724
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1725
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1726
}\
1727
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1728
    uint8_t full[16*9];\
1729
    uint8_t halfH[72];\
1730
    uint8_t halfV[64];\
1731
    uint8_t halfHV[64];\
1732
    copy_block9(full, src, 16, stride, 9);\
1733
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1735
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737
}\
1738
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1739
    uint8_t full[16*9];\
1740
    uint8_t halfH[72];\
1741
    uint8_t halfHV[64];\
1742
    copy_block9(full, src, 16, stride, 9);\
1743
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1744
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1745
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1746
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1747
}\
1748
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1749
    uint8_t full[16*9];\
1750
    uint8_t halfH[72];\
1751
    uint8_t halfV[64];\
1752
    uint8_t halfHV[64];\
1753
    copy_block9(full, src, 16, stride, 9);\
1754
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1756
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758
}\
1759
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1760
    uint8_t full[16*9];\
1761
    uint8_t halfH[72];\
1762
    uint8_t halfHV[64];\
1763
    copy_block9(full, src, 16, stride, 9);\
1764
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1766
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768
}\
1769
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770
    uint8_t full[16*9];\
1771
    uint8_t halfH[72];\
1772
    uint8_t halfV[64];\
1773
    uint8_t halfHV[64];\
1774
    copy_block9(full, src, 16, stride, 9);\
1775
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779
}\
1780
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1781
    uint8_t full[16*9];\
1782
    uint8_t halfH[72];\
1783
    uint8_t halfHV[64];\
1784
    copy_block9(full, src, 16, stride, 9);\
1785
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1789
}\
1790
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791
    uint8_t full[16*9];\
1792
    uint8_t halfH[72];\
1793
    uint8_t halfV[64];\
1794
    uint8_t halfHV[64];\
1795
    copy_block9(full, src, 16, stride, 9);\
1796
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1797
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800
}\
1801
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1802
    uint8_t full[16*9];\
1803
    uint8_t halfH[72];\
1804
    uint8_t halfHV[64];\
1805
    copy_block9(full, src, 16, stride, 9);\
1806
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1810
}\
1811
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1812
    uint8_t halfH[72];\
1813
    uint8_t halfHV[64];\
1814
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1815
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1816
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1817
}\
1818
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1819
    uint8_t halfH[72];\
1820
    uint8_t halfHV[64];\
1821
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1822
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824
}\
1825
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826
    uint8_t full[16*9];\
1827
    uint8_t halfH[72];\
1828
    uint8_t halfV[64];\
1829
    uint8_t halfHV[64];\
1830
    copy_block9(full, src, 16, stride, 9);\
1831
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1833
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1835
}\
1836
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1837
    uint8_t full[16*9];\
1838
    uint8_t halfH[72];\
1839
    copy_block9(full, src, 16, stride, 9);\
1840
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1842
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1843
}\
1844
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1845
    uint8_t full[16*9];\
1846
    uint8_t halfH[72];\
1847
    uint8_t halfV[64];\
1848
    uint8_t halfHV[64];\
1849
    copy_block9(full, src, 16, stride, 9);\
1850
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1851
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1852
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1854
}\
1855
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1856
    uint8_t full[16*9];\
1857
    uint8_t halfH[72];\
1858
    copy_block9(full, src, 16, stride, 9);\
1859
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1861
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1862
}\
1863
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1864
    uint8_t halfH[72];\
1865
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1866
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1867
}\
1868
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1869
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1870
}\
1871
\
1872
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1873
    uint8_t half[256];\
1874
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1875
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1876
}\
1877
\
1878
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1879
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1880
}\
1881
\
1882
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1883
    uint8_t half[256];\
1884
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1885
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1886
}\
1887
\
1888
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1889
    uint8_t full[24*17];\
1890
    uint8_t half[256];\
1891
    copy_block17(full, src, 24, stride, 17);\
1892
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1893
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1894
}\
1895
\
1896
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1897
    uint8_t full[24*17];\
1898
    copy_block17(full, src, 24, stride, 17);\
1899
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1900
}\
1901
\
1902
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1903
    uint8_t full[24*17];\
1904
    uint8_t half[256];\
1905
    copy_block17(full, src, 24, stride, 17);\
1906
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1907
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1908
}\
1909
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1910
    uint8_t full[24*17];\
1911
    uint8_t halfH[272];\
1912
    uint8_t halfV[256];\
1913
    uint8_t halfHV[256];\
1914
    copy_block17(full, src, 24, stride, 17);\
1915
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1917
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919
}\
1920
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1921
    uint8_t full[24*17];\
1922
    uint8_t halfH[272];\
1923
    uint8_t halfHV[256];\
1924
    copy_block17(full, src, 24, stride, 17);\
1925
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1926
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1927
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1928
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1929
}\
1930
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1931
    uint8_t full[24*17];\
1932
    uint8_t halfH[272];\
1933
    uint8_t halfV[256];\
1934
    uint8_t halfHV[256];\
1935
    copy_block17(full, src, 24, stride, 17);\
1936
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1938
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940
}\
1941
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1942
    uint8_t full[24*17];\
1943
    uint8_t halfH[272];\
1944
    uint8_t halfHV[256];\
1945
    copy_block17(full, src, 24, stride, 17);\
1946
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1947
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1948
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950
}\
1951
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1952
    uint8_t full[24*17];\
1953
    uint8_t halfH[272];\
1954
    uint8_t halfV[256];\
1955
    uint8_t halfHV[256];\
1956
    copy_block17(full, src, 24, stride, 17);\
1957
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1958
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1959
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1960
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1961
}\
1962
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1963
    uint8_t full[24*17];\
1964
    uint8_t halfH[272];\
1965
    uint8_t halfHV[256];\
1966
    copy_block17(full, src, 24, stride, 17);\
1967
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1969
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1971
}\
1972
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1973
    uint8_t full[24*17];\
1974
    uint8_t halfH[272];\
1975
    uint8_t halfV[256];\
1976
    uint8_t halfHV[256];\
1977
    copy_block17(full, src, 24, stride, 17);\
1978
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1979
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1980
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1981
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1982
}\
1983
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1984
    uint8_t full[24*17];\
1985
    uint8_t halfH[272];\
1986
    uint8_t halfHV[256];\
1987
    copy_block17(full, src, 24, stride, 17);\
1988
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1990
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1992
}\
1993
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1994
    uint8_t halfH[272];\
1995
    uint8_t halfHV[256];\
1996
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1997
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1999
}\
2000
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2001
    uint8_t halfH[272];\
2002
    uint8_t halfHV[256];\
2003
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2004
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006
}\
2007
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008
    uint8_t full[24*17];\
2009
    uint8_t halfH[272];\
2010
    uint8_t halfV[256];\
2011
    uint8_t halfHV[256];\
2012
    copy_block17(full, src, 24, stride, 17);\
2013
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2015
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2017
}\
2018
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2019
    uint8_t full[24*17];\
2020
    uint8_t halfH[272];\
2021
    copy_block17(full, src, 24, stride, 17);\
2022
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2024
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2025
}\
2026
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2027
    uint8_t full[24*17];\
2028
    uint8_t halfH[272];\
2029
    uint8_t halfV[256];\
2030
    uint8_t halfHV[256];\
2031
    copy_block17(full, src, 24, stride, 17);\
2032
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2033
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2034
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2035
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2036
}\
2037
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2038
    uint8_t full[24*17];\
2039
    uint8_t halfH[272];\
2040
    copy_block17(full, src, 24, stride, 17);\
2041
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2043
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2044
}\
2045
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2046
    uint8_t halfH[272];\
2047
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2048
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2049
}
/* Pixel store operators used to instantiate the MPEG-4 qpel MC functions
 * via QPEL_MC() below.  'b' is a 6-tap filter output whose coefficients
 * sum to 32 ((20+20) - (6+6) + (3+3) - (1+1)), so '>>5' rescales it to
 * pixel range; 'cm' (cropTbl + MAX_NEG_CROP, in scope inside the expanded
 * functions) clips the result to 0..255.
 *   op_put        - store, round-to-nearest (+16 before >>5)
 *   op_put_no_rnd - store, round-down bias (+15)
 *   op_avg        - average with existing dst, rounding up
 *   op_avg_no_rnd - average with existing dst, rounding down */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full MPEG-4 quarter-pel motion-compensation function
 * families: rounding "put", non-rounding "put_no_rnd" and rounding "avg"
 * variants of qpel8/qpel16 mc00..mc33.  The store operators are #undef'd
 * immediately afterwards to keep them from leaking into later code. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
2067
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2068
    const int h=2;\
2069
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2070
    int i;\
2071
    for(i=0; i<h; i++)\
2072
    {\
2073
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2074
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2075
        dst+=dstStride;\
2076
        src+=srcStride;\
2077
    }\
2078
}\
2079
\
2080
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2081
    const int w=2;\
2082
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2083
    int i;\
2084
    for(i=0; i<w; i++)\
2085
    {\
2086
        const int srcB= src[-2*srcStride];\
2087
        const int srcA= src[-1*srcStride];\
2088
        const int src0= src[0 *srcStride];\
2089
        const int src1= src[1 *srcStride];\
2090
        const int src2= src[2 *srcStride];\
2091
        const int src3= src[3 *srcStride];\
2092
        const int src4= src[4 *srcStride];\
2093
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2094
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2095
        dst++;\
2096
        src++;\
2097
    }\
2098
}\
2099
\
2100
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2101
    const int h=2;\
2102
    const int w=2;\
2103
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2104
    int i;\
2105
    src -= 2*srcStride;\
2106
    for(i=0; i<h+5; i++)\
2107
    {\
2108
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2109
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2110
        tmp+=tmpStride;\
2111
        src+=srcStride;\
2112
    }\
2113
    tmp -= tmpStride*(h+5-2);\
2114
    for(i=0; i<w; i++)\
2115
    {\
2116
        const int tmpB= tmp[-2*tmpStride];\
2117
        const int tmpA= tmp[-1*tmpStride];\
2118
        const int tmp0= tmp[0 *tmpStride];\
2119
        const int tmp1= tmp[1 *tmpStride];\
2120
        const int tmp2= tmp[2 *tmpStride];\
2121
        const int tmp3= tmp[3 *tmpStride];\
2122
        const int tmp4= tmp[4 *tmpStride];\
2123
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2124
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2125
        dst++;\
2126
        tmp++;\
2127
    }\
2128
}\
2129
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2130
    const int h=4;\
2131
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2132
    int i;\
2133
    for(i=0; i<h; i++)\
2134
    {\
2135
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2136
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2137
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2138
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2139
        dst+=dstStride;\
2140
        src+=srcStride;\
2141
    }\
2142
}\
2143
\
2144
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2145
    const int w=4;\
2146
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2147
    int i;\
2148
    for(i=0; i<w; i++)\
2149
    {\
2150
        const int srcB= src[-2*srcStride];\
2151
        const int srcA= src[-1*srcStride];\
2152
        const int src0= src[0 *srcStride];\
2153
        const int src1= src[1 *srcStride];\
2154
        const int src2= src[2 *srcStride];\
2155
        const int src3= src[3 *srcStride];\
2156
        const int src4= src[4 *srcStride];\
2157
        const int src5= src[5 *srcStride];\
2158
        const int src6= src[6 *srcStride];\
2159
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2160
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2161
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2162
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2163
        dst++;\
2164
        src++;\
2165
    }\
2166
}\
2167
\
2168
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2169
    const int h=4;\
2170
    const int w=4;\
2171
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2172
    int i;\
2173
    src -= 2*srcStride;\
2174
    for(i=0; i<h+5; i++)\
2175
    {\
2176
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2177
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2178
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2179
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2180
        tmp+=tmpStride;\
2181
        src+=srcStride;\
2182
    }\
2183
    tmp -= tmpStride*(h+5-2);\
2184
    for(i=0; i<w; i++)\
2185
    {\
2186
        const int tmpB= tmp[-2*tmpStride];\
2187
        const int tmpA= tmp[-1*tmpStride];\
2188
        const int tmp0= tmp[0 *tmpStride];\
2189
        const int tmp1= tmp[1 *tmpStride];\
2190
        const int tmp2= tmp[2 *tmpStride];\
2191
        const int tmp3= tmp[3 *tmpStride];\
2192
        const int tmp4= tmp[4 *tmpStride];\
2193
        const int tmp5= tmp[5 *tmpStride];\
2194
        const int tmp6= tmp[6 *tmpStride];\
2195
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2196
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2197
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2198
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2199
        dst++;\
2200
        tmp++;\
2201
    }\
2202
}\
2203
\
2204
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205
    const int h=8;\
2206
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2207
    int i;\
2208
    for(i=0; i<h; i++)\
2209
    {\
2210
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2211
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2212
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2213
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2214
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2215
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2216
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2217
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2218
        dst+=dstStride;\
2219
        src+=srcStride;\
2220
    }\
2221
}\
2222
\
2223
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2224
    const int w=8;\
2225
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2226
    int i;\
2227
    for(i=0; i<w; i++)\
2228
    {\
2229
        const int srcB= src[-2*srcStride];\
2230
        const int srcA= src[-1*srcStride];\
2231
        const int src0= src[0 *srcStride];\
2232
        const int src1= src[1 *srcStride];\
2233
        const int src2= src[2 *srcStride];\
2234
        const int src3= src[3 *srcStride];\
2235
        const int src4= src[4 *srcStride];\
2236
        const int src5= src[5 *srcStride];\
2237
        const int src6= src[6 *srcStride];\
2238
        const int src7= src[7 *srcStride];\
2239
        const int src8= src[8 *srcStride];\
2240
        const int src9= src[9 *srcStride];\
2241
        const int src10=src[10*srcStride];\
2242
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2243
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2244
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2245
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2246
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2247
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2248
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2249
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2250
        dst++;\
2251
        src++;\
2252
    }\
2253
}\
2254
\
2255
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2256
    const int h=8;\
2257
    const int w=8;\
2258
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2259
    int i;\
2260
    src -= 2*srcStride;\
2261
    for(i=0; i<h+5; i++)\
2262
    {\
2263
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2264
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2265
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2266
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2267
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2268
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2269
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2270
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2271
        tmp+=tmpStride;\
2272
        src+=srcStride;\
2273
    }\
2274
    tmp -= tmpStride*(h+5-2);\
2275
    for(i=0; i<w; i++)\
2276
    {\
2277
        const int tmpB= tmp[-2*tmpStride];\
2278
        const int tmpA= tmp[-1*tmpStride];\
2279
        const int tmp0= tmp[0 *tmpStride];\
2280
        const int tmp1= tmp[1 *tmpStride];\
2281
        const int tmp2= tmp[2 *tmpStride];\
2282
        const int tmp3= tmp[3 *tmpStride];\
2283
        const int tmp4= tmp[4 *tmpStride];\
2284
        const int tmp5= tmp[5 *tmpStride];\
2285
        const int tmp6= tmp[6 *tmpStride];\
2286
        const int tmp7= tmp[7 *tmpStride];\
2287
        const int tmp8= tmp[8 *tmpStride];\
2288
        const int tmp9= tmp[9 *tmpStride];\
2289
        const int tmp10=tmp[10*tmpStride];\
2290
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2291
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2292
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2293
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2294
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2295
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2296
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2297
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2298
        dst++;\
2299
        tmp++;\
2300
    }\
2301
}\
2302
\
2303
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2304
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2305
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306
    src += 8*srcStride;\
2307
    dst += 8*dstStride;\
2308
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2309
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2310
}\
2311
\
2312
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2313
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2314
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315
    src += 8*srcStride;\
2316
    dst += 8*dstStride;\
2317
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2318
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2319
}\
2320
\
2321
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2322
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2323
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324
    src += 8*srcStride;\
2325
    dst += 8*dstStride;\
2326
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2327
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2328
}\
2329

    
2330
#define H264_MC(OPNAME, SIZE) \
2331
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2332
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2333
}\
2334
\
2335
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2336
    uint8_t half[SIZE*SIZE];\
2337
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2338
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2339
}\
2340
\
2341
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2342
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2343
}\
2344
\
2345
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2346
    uint8_t half[SIZE*SIZE];\
2347
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2348
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2349
}\
2350
\
2351
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2352
    uint8_t full[SIZE*(SIZE+5)];\
2353
    uint8_t * const full_mid= full + SIZE*2;\
2354
    uint8_t half[SIZE*SIZE];\
2355
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2356
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2357
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2358
}\
2359
\
2360
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2361
    uint8_t full[SIZE*(SIZE+5)];\
2362
    uint8_t * const full_mid= full + SIZE*2;\
2363
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2364
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2365
}\
2366
\
2367
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2368
    uint8_t full[SIZE*(SIZE+5)];\
2369
    uint8_t * const full_mid= full + SIZE*2;\
2370
    uint8_t half[SIZE*SIZE];\
2371
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2372
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2373
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2374
}\
2375
\
2376
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2377
    uint8_t full[SIZE*(SIZE+5)];\
2378
    uint8_t * const full_mid= full + SIZE*2;\
2379
    uint8_t halfH[SIZE*SIZE];\
2380
    uint8_t halfV[SIZE*SIZE];\
2381
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2382
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2383
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2384
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2385
}\
2386
\
2387
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2388
    uint8_t full[SIZE*(SIZE+5)];\
2389
    uint8_t * const full_mid= full + SIZE*2;\
2390
    uint8_t halfH[SIZE*SIZE];\
2391
    uint8_t halfV[SIZE*SIZE];\
2392
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2393
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2394
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2395
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2396
}\
2397
\
2398
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2399
    uint8_t full[SIZE*(SIZE+5)];\
2400
    uint8_t * const full_mid= full + SIZE*2;\
2401
    uint8_t halfH[SIZE*SIZE];\
2402
    uint8_t halfV[SIZE*SIZE];\
2403
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2404
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2405
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2406
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2407
}\
2408
\
2409
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2410
    uint8_t full[SIZE*(SIZE+5)];\
2411
    uint8_t * const full_mid= full + SIZE*2;\
2412
    uint8_t halfH[SIZE*SIZE];\
2413
    uint8_t halfV[SIZE*SIZE];\
2414
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2415
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2416
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2417
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2418
}\
2419
\
2420
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2421
    int16_t tmp[SIZE*(SIZE+5)];\
2422
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2423
}\
2424
\
2425
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2426
    int16_t tmp[SIZE*(SIZE+5)];\
2427
    uint8_t halfH[SIZE*SIZE];\
2428
    uint8_t halfHV[SIZE*SIZE];\
2429
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2430
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2431
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2432
}\
2433
\
2434
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2435
    int16_t tmp[SIZE*(SIZE+5)];\
2436
    uint8_t halfH[SIZE*SIZE];\
2437
    uint8_t halfHV[SIZE*SIZE];\
2438
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2439
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2440
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2441
}\
2442
\
2443
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2444
    uint8_t full[SIZE*(SIZE+5)];\
2445
    uint8_t * const full_mid= full + SIZE*2;\
2446
    int16_t tmp[SIZE*(SIZE+5)];\
2447
    uint8_t halfV[SIZE*SIZE];\
2448
    uint8_t halfHV[SIZE*SIZE];\
2449
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2450
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2451
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2452
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2453
}\
2454
\
2455
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2456
    uint8_t full[SIZE*(SIZE+5)];\
2457
    uint8_t * const full_mid= full + SIZE*2;\
2458
    int16_t tmp[SIZE*(SIZE+5)];\
2459
    uint8_t halfV[SIZE*SIZE];\
2460
    uint8_t halfHV[SIZE*SIZE];\
2461
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2462
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2463
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2464
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2465
}\
2466

    
2467
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2468
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2469
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2470
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2471
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2472

    
2473
H264_LOWPASS(put_       , op_put, op2_put)
2474
H264_LOWPASS(avg_       , op_avg, op2_avg)
2475
H264_MC(put_, 2)
2476
H264_MC(put_, 4)
2477
H264_MC(put_, 8)
2478
H264_MC(put_, 16)
2479
H264_MC(avg_, 4)
2480
H264_MC(avg_, 8)
2481
H264_MC(avg_, 16)
2482

    
2483
#undef op_avg
2484
#undef op_put
2485
#undef op2_avg
2486
#undef op2_put
2487
#endif
2488

    
2489
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2490
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2491
#define H264_WEIGHT(W,H) \
2492
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2493
    int y; \
2494
    offset <<= log2_denom; \
2495
    if(log2_denom) offset += 1<<(log2_denom-1); \
2496
    for(y=0; y<H; y++, block += stride){ \
2497
        op_scale1(0); \
2498
        op_scale1(1); \
2499
        if(W==2) continue; \
2500
        op_scale1(2); \
2501
        op_scale1(3); \
2502
        if(W==4) continue; \
2503
        op_scale1(4); \
2504
        op_scale1(5); \
2505
        op_scale1(6); \
2506
        op_scale1(7); \
2507
        if(W==8) continue; \
2508
        op_scale1(8); \
2509
        op_scale1(9); \
2510
        op_scale1(10); \
2511
        op_scale1(11); \
2512
        op_scale1(12); \
2513
        op_scale1(13); \
2514
        op_scale1(14); \
2515
        op_scale1(15); \
2516
    } \
2517
} \
2518
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2519
    int y; \
2520
    offset = ((offset + 1) | 1) << log2_denom; \
2521
    for(y=0; y<H; y++, dst += stride, src += stride){ \
2522
        op_scale2(0); \
2523
        op_scale2(1); \
2524
        if(W==2) continue; \
2525
        op_scale2(2); \
2526
        op_scale2(3); \
2527
        if(W==4) continue; \
2528
        op_scale2(4); \
2529
        op_scale2(5); \
2530
        op_scale2(6); \
2531
        op_scale2(7); \
2532
        if(W==8) continue; \
2533
        op_scale2(8); \
2534
        op_scale2(9); \
2535
        op_scale2(10); \
2536
        op_scale2(11); \
2537
        op_scale2(12); \
2538
        op_scale2(13); \
2539
        op_scale2(14); \
2540
        op_scale2(15); \
2541
    } \
2542
}
2543

    
2544
H264_WEIGHT(16,16)
2545
H264_WEIGHT(16,8)
2546
H264_WEIGHT(8,16)
2547
H264_WEIGHT(8,8)
2548
H264_WEIGHT(8,4)
2549
H264_WEIGHT(4,8)
2550
H264_WEIGHT(4,4)
2551
H264_WEIGHT(4,2)
2552
H264_WEIGHT(2,4)
2553
H264_WEIGHT(2,2)
2554

    
2555
#undef op_scale1
2556
#undef op_scale2
2557
#undef H264_WEIGHT
2558

    
2559
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2560
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2561
    int i;
2562

    
2563
    for(i=0; i<h; i++){
2564
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2565
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2566
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2567
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2568
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2569
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2570
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2571
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2572
        dst+=dstStride;
2573
        src+=srcStride;
2574
    }
2575
}
2576

    
2577
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2578
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2579
    int i;
2580

    
2581
    for(i=0; i<w; i++){
2582
        const int src_1= src[ -srcStride];
2583
        const int src0 = src[0          ];
2584
        const int src1 = src[  srcStride];
2585
        const int src2 = src[2*srcStride];
2586
        const int src3 = src[3*srcStride];
2587
        const int src4 = src[4*srcStride];
2588
        const int src5 = src[5*srcStride];
2589
        const int src6 = src[6*srcStride];
2590
        const int src7 = src[7*srcStride];
2591
        const int src8 = src[8*srcStride];
2592
        const int src9 = src[9*srcStride];
2593
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2594
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2595
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2596
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2597
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2598
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2599
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2600
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2601
        src++;
2602
        dst++;
2603
    }
2604
}
2605

    
2606
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2607
    put_pixels8_c(dst, src, stride, 8);
2608
}
2609

    
2610
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2611
    uint8_t half[64];
2612
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2613
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2614
}
2615

    
2616
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2617
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2618
}
2619

    
2620
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2621
    uint8_t half[64];
2622
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2623
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2624
}
2625

    
2626
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2627
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2628
}
2629

    
2630
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2631
    uint8_t halfH[88];
2632
    uint8_t halfV[64];
2633
    uint8_t halfHV[64];
2634
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2635
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2636
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2637
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2638
}
2639
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2640
    uint8_t halfH[88];
2641
    uint8_t halfV[64];
2642
    uint8_t halfHV[64];
2643
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2644
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2645
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2646
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2647
}
2648
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2649
    uint8_t halfH[88];
2650
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2651
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2652
}
2653

    
2654
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655
    int x;
2656
    const int strength= ff_h263_loop_filter_strength[qscale];
2657

    
2658
    for(x=0; x<8; x++){
2659
        int d1, d2, ad1;
2660
        int p0= src[x-2*stride];
2661
        int p1= src[x-1*stride];
2662
        int p2= src[x+0*stride];
2663
        int p3= src[x+1*stride];
2664
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2665

    
2666
        if     (d<-2*strength) d1= 0;
2667
        else if(d<-  strength) d1=-2*strength - d;
2668
        else if(d<   strength) d1= d;
2669
        else if(d< 2*strength) d1= 2*strength - d;
2670
        else                   d1= 0;
2671

    
2672
        p1 += d1;
2673
        p2 -= d1;
2674
        if(p1&256) p1= ~(p1>>31);
2675
        if(p2&256) p2= ~(p2>>31);
2676

    
2677
        src[x-1*stride] = p1;
2678
        src[x+0*stride] = p2;
2679

    
2680
        ad1= ABS(d1)>>1;
2681

    
2682
        d2= clip((p0-p3)/4, -ad1, ad1);
2683

    
2684
        src[x-2*stride] = p0 - d2;
2685
        src[x+  stride] = p3 + d2;
2686
    }
2687
}
2688

    
2689
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2690
    int y;
2691
    const int strength= ff_h263_loop_filter_strength[qscale];
2692

    
2693
    for(y=0; y<8; y++){
2694
        int d1, d2, ad1;
2695
        int p0= src[y*stride-2];
2696
        int p1= src[y*stride-1];
2697
        int p2= src[y*stride+0];
2698
        int p3= src[y*stride+1];
2699
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700

    
2701
        if     (d<-2*strength) d1= 0;
2702
        else if(d<-  strength) d1=-2*strength - d;
2703
        else if(d<   strength) d1= d;
2704
        else if(d< 2*strength) d1= 2*strength - d;
2705
        else                   d1= 0;
2706

    
2707
        p1 += d1;
2708
        p2 -= d1;
2709
        if(p1&256) p1= ~(p1>>31);
2710
        if(p2&256) p2= ~(p2>>31);
2711

    
2712
        src[y*stride-1] = p1;
2713
        src[y*stride+0] = p2;
2714

    
2715
        ad1= ABS(d1)>>1;
2716

    
2717
        d2= clip((p0-p3)/4, -ad1, ad1);
2718

    
2719
        src[y*stride-2] = p0 - d2;
2720
        src[y*stride+1] = p3 + d2;
2721
    }
2722
}
2723

    
2724
static void h261_loop_filter_c(uint8_t *src, int stride){
2725
    int x,y,xy,yz;
2726
    int temp[64];
2727

    
2728
    for(x=0; x<8; x++){
2729
        temp[x      ] = 4*src[x           ];
2730
        temp[x + 7*8] = 4*src[x + 7*stride];
2731
    }
2732
    for(y=1; y<7; y++){
2733
        for(x=0; x<8; x++){
2734
            xy = y * stride + x;
2735
            yz = y * 8 + x;
2736
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2737
        }
2738
    }
2739

    
2740
    for(y=0; y<8; y++){
2741
        src[  y*stride] = (temp[  y*8] + 2)>>2;
2742
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2743
        for(x=1; x<7; x++){
2744
            xy = y * stride + x;
2745
            yz = y * 8 + x;
2746
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2747
        }
2748
    }
2749
}
2750

    
2751
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2752
{
2753
    int i, d;
2754
    for( i = 0; i < 4; i++ ) {
2755
        if( tc0[i] < 0 ) {
2756
            pix += 4*ystride;
2757
            continue;
2758
        }
2759
        for( d = 0; d < 4; d++ ) {
2760
            const int p0 = pix[-1*xstride];
2761
            const int p1 = pix[-2*xstride];
2762
            const int p2 = pix[-3*xstride];
2763
            const int q0 = pix[0];
2764
            const int q1 = pix[1*xstride];
2765
            const int q2 = pix[2*xstride];
2766

    
2767
            if( ABS( p0 - q0 ) < alpha &&
2768
                ABS( p1 - p0 ) < beta &&
2769
                ABS( q1 - q0 ) < beta ) {
2770

    
2771
                int tc = tc0[i];
2772
                int i_delta;
2773

    
2774
                if( ABS( p2 - p0 ) < beta ) {
2775
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2776
                    tc++;
2777
                }
2778
                if( ABS( q2 - q0 ) < beta ) {
2779
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2780
                    tc++;
2781
                }
2782

    
2783
                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2784
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
2785
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
2786
            }
2787
            pix += ystride;
2788
        }
2789
    }
2790
}
2791
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2792
{
2793
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2794
}
2795
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2796
{
2797
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2798
}
2799

    
2800
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2801
{
2802
    int i, d;
2803
    for( i = 0; i < 4; i++ ) {
2804
        const int tc = tc0[i];
2805
        if( tc <= 0 ) {
2806
            pix += 2*ystride;
2807
            continue;
2808
        }
2809
        for( d = 0; d < 2; d++ ) {
2810
            const int p0 = pix[-1*xstride];
2811
            const int p1 = pix[-2*xstride];
2812
            const int q0 = pix[0];
2813
            const int q1 = pix[1*xstride];
2814

    
2815
            if( ABS( p0 - q0 ) < alpha &&
2816
                ABS( p1 - p0 ) < beta &&
2817
                ABS( q1 - q0 ) < beta ) {
2818

    
2819
                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2820

    
2821
                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
2822
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
2823
            }
2824
            pix += ystride;
2825
        }
2826
    }
2827
}
2828
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2829
{
2830
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2831
}
2832
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2833
{
2834
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2835
}
2836

    
2837
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2838
{
2839
    int d;
2840
    for( d = 0; d < 8; d++ ) {
2841
        const int p0 = pix[-1*xstride];
2842
        const int p1 = pix[-2*xstride];
2843
        const int q0 = pix[0];
2844
        const int q1 = pix[1*xstride];
2845

    
2846
        if( ABS( p0 - q0 ) < alpha &&
2847
            ABS( p1 - p0 ) < beta &&
2848
            ABS( q1 - q0 ) < beta ) {
2849

    
2850
            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2851
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
2852
        }
2853
        pix += ystride;
2854
    }
2855
}
2856
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2857
{
2858
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2859
}
2860
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2861
{
2862
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2863
}
2864

    
2865
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2866
{
2867
    int s, i;
2868

    
2869
    s = 0;
2870
    for(i=0;i<h;i++) {
2871
        s += abs(pix1[0] - pix2[0]);
2872
        s += abs(pix1[1] - pix2[1]);
2873
        s += abs(pix1[2] - pix2[2]);
2874
        s += abs(pix1[3] - pix2[3]);
2875
        s += abs(pix1[4] - pix2[4]);
2876
        s += abs(pix1[5] - pix2[5]);
2877
        s += abs(pix1[6] - pix2[6]);
2878
        s += abs(pix1[7] - pix2[7]);
2879
        s += abs(pix1[8] - pix2[8]);
2880
        s += abs(pix1[9] - pix2[9]);
2881
        s += abs(pix1[10] - pix2[10]);
2882
        s += abs(pix1[11] - pix2[11]);
2883
        s += abs(pix1[12] - pix2[12]);
2884
        s += abs(pix1[13] - pix2[13]);
2885
        s += abs(pix1[14] - pix2[14]);
2886
        s += abs(pix1[15] - pix2[15]);
2887
        pix1 += line_size;
2888
        pix2 += line_size;
2889
    }
2890
    return s;
2891
}
2892

    
2893
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2894
{
2895
    int s, i;
2896

    
2897
    s = 0;
2898
    for(i=0;i<h;i++) {
2899
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2900
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2901
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2902
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2903
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2904
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2905
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2906
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2907
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2908
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2909
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2910
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2911
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2912
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2913
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2914
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2915
        pix1 += line_size;
2916
        pix2 += line_size;
2917
    }
2918
    return s;
2919
}
2920

    
2921
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2922
{
2923
    int s, i;
2924
    uint8_t *pix3 = pix2 + line_size;
2925

    
2926
    s = 0;
2927
    for(i=0;i<h;i++) {
2928
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2929
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2930
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2931
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2932
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2933
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2934
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2935
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2936
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2937
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2938
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2939
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2940
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2941
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2942
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2943
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2944
        pix1 += line_size;
2945
        pix2 += line_size;
2946
        pix3 += line_size;
2947
    }
2948
    return s;
2949
}
2950

    
2951
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2952
{
2953
    int s, i;
2954
    uint8_t *pix3 = pix2 + line_size;
2955

    
2956
    s = 0;
2957
    for(i=0;i<h;i++) {
2958
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2959
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2960
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2961
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2962
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2963
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2964
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2965
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2966
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2967
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2968
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2969
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2970
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2971
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2972
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2973
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2974
        pix1 += line_size;
2975
        pix2 += line_size;
2976
        pix3 += line_size;
2977
    }
2978
    return s;
2979
}
2980

    
2981
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2982
{
2983
    int s, i;
2984

    
2985
    s = 0;
2986
    for(i=0;i<h;i++) {
2987
        s += abs(pix1[0] - pix2[0]);
2988
        s += abs(pix1[1] - pix2[1]);
2989
        s += abs(pix1[2] - pix2[2]);
2990
        s += abs(pix1[3] - pix2[3]);
2991
        s += abs(pix1[4] - pix2[4]);
2992
        s += abs(pix1[5] - pix2[5]);
2993
        s += abs(pix1[6] - pix2[6]);
2994
        s += abs(pix1[7] - pix2[7]);
2995
        pix1 += line_size;
2996
        pix2 += line_size;
2997
    }
2998
    return s;
2999
}
3000

    
3001
/* SAD against the reference half-pel interpolated horizontally: each
 * reference sample is avg2() of two horizontal neighbours. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3020

    
3021
/* SAD against the reference half-pel interpolated vertically: each
 * reference sample is avg2() of a pixel and the one directly below it. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3042

    
3043
/* SAD against the reference half-pel interpolated in both directions:
 * each reference sample is avg4() of a 2x2 neighbourhood. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3064

    
3065
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3066
    MpegEncContext *c = v;
3067
    int score1=0;
3068
    int score2=0;
3069
    int x,y;
3070

    
3071
    for(y=0; y<h; y++){
3072
        for(x=0; x<16; x++){
3073
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3074
        }
3075
        if(y+1<h){
3076
            for(x=0; x<15; x++){
3077
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3078
                             - s1[x+1] + s1[x+1+stride])
3079
                        -ABS(  s2[x  ] - s2[x  +stride]
3080
                             - s2[x+1] + s2[x+1+stride]);
3081
            }
3082
        }
3083
        s1+= stride;
3084
        s2+= stride;
3085
    }
3086

    
3087
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3088
    else  return score1 + ABS(score2)*8;
3089
}
3090

    
3091
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092
    MpegEncContext *c = v;
3093
    int score1=0;
3094
    int score2=0;
3095
    int x,y;
3096

    
3097
    for(y=0; y<h; y++){
3098
        for(x=0; x<8; x++){
3099
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3100
        }
3101
        if(y+1<h){
3102
            for(x=0; x<7; x++){
3103
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3104
                             - s1[x+1] + s1[x+1+stride])
3105
                        -ABS(  s2[x  ] - s2[x  +stride]
3106
                             - s2[x+1] + s2[x+1+stride]);
3107
            }
3108
        }
3109
        s1+= stride;
3110
        s2+= stride;
3111
    }
3112

    
3113
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114
    else  return score1 + ABS(score2)*8;
3115
}
3116

    
3117
/* Estimates the weighted squared error that would result from adding
 * 'basis' scaled by 'scale' to the 8x8 residual 'rem'. Fixed-point,
 * bit-exact: the BASIS_SHIFT/RECON_SHIFT rounding must not be changed.
 * Returns the accumulated (w*b)^2>>4 sum, shifted right by 2. */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* hypothetical residual after adding the scaled basis, with rounding */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
3131

    
3132
/* Adds 'basis' scaled by 'scale' (with rounding) into the 8x8 residual
 * 'rem', converting from BASIS_SHIFT down to RECON_SHIFT fixed-point
 * precision. Must stay bit-exact with try_8x8basis_c above. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3139

    
3140
/**
3141
 * permutes an 8x8 block.
3142
 * @param block the block which will be permuted according to the given permutation vector
3143
 * @param permutation the permutation vector
3144
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3145
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3146
 *                  (inverse) permutated to scantable order!
3147
 */
3148
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3149
{
3150
    int i;
3151
    DCTELEM temp[64];
3152

    
3153
    if(last<=0) return;
3154
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3155

    
3156
    for(i=0; i<=last; i++){
3157
        const int j= scantable[i];
3158
        temp[j]= block[j];
3159
        block[j]=0;
3160
    }
3161

    
3162
    for(i=0; i<=last; i++){
3163
        const int j= scantable[i];
3164
        const int perm_j= permutation[j];
3165
        block[perm_j]= temp[j];
3166
    }
3167
}
3168

    
3169
/* Dummy compare function for FF_CMP_ZERO: always reports a cost of 0,
 * regardless of the input blocks. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3172

    
3173
/* Fills cmp[0..4] with the comparison functions from 'c' selected by the
 * FF_CMP_* value in the low byte of 'type'; an unknown type logs an error
 * and leaves the corresponding entries NULL. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        me_cmp_func f = NULL;

        switch(type&0xFF){
        case FF_CMP_SAD:    f = c->sad[i];            break;
        case FF_CMP_SATD:   f = c->hadamard8_diff[i]; break;
        case FF_CMP_SSE:    f = c->sse[i];            break;
        case FF_CMP_DCT:    f = c->dct_sad[i];        break;
        case FF_CMP_DCT264: f = c->dct264_sad[i];     break;
        case FF_CMP_DCTMAX: f = c->dct_max[i];        break;
        case FF_CMP_PSNR:   f = c->quant_psnr[i];     break;
        case FF_CMP_BIT:    f = c->bit[i];            break;
        case FF_CMP_RD:     f = c->rd[i];             break;
        case FF_CMP_VSAD:   f = c->vsad[i];           break;
        case FF_CMP_VSSE:   f = c->vsse[i];           break;
        case FF_CMP_ZERO:   f = zero_cmp;             break;
        case FF_CMP_NSSE:   f = c->nsse[i];           break;
        case FF_CMP_W53:    f = c->w53[i];            break;
        case FF_CMP_W97:    f = c->w97[i];            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
        cmp[i] = f;
    }
}
3230

    
3231
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 64-coefficient blocks of a macroblock in one call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3238

    
3239
/* dst[i] += src[i] for 0 <= i < w; byte arithmetic, wraps modulo 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
3254

    
3255
/* dst[i] = src1[i] - src2[i] for 0 <= i < w; byte arithmetic, wraps modulo 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3270

    
3271
/* HuffYUV median prediction, subtract direction: for each sample,
 * dst[i] = src2[i] - mid_pred(left, top, left+top-topleft), where the top
 * row comes from src1. *left and *left_top carry state across calls. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l  = *left;
    uint8_t lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        /* update the state BEFORE writing: lt becomes this column's top,
           l becomes this column's actual value */
        lt = src1[i];
        l  = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
3288

    
3289
/* out-of-place 2-point butterfly: o1 = i1+i2, o2 = i1-i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place 2-point butterfly: x,y := x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* absolute-value butterfly: |x+y| + |x-y|; used as the final Hadamard
   stage so the sum of absolute coefficients is accumulated directly */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3303

    
3304
/* SATD of the 8x8 residual src-dst: applies an 8x8 Hadamard transform to
 * the difference block (rows first, then columns) and returns the sum of
 * absolute transform coefficients. h must be 8.
 * (Removed a dead `#if 0` debug block that kept a function-static maximum
 * counter — dead code, and non-reentrant if ever enabled.) */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1: butterflies on adjacent pairs of the row residual */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        /* stage 2: distance-2 butterflies */
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        /* stage 3: distance-4 butterflies complete the 1D row transform */
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* same three stages down each column; the last stage folds
           |x+y| + |x-y| straight into the sum via BUTTERFLYA */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
3355

    
3356
/* Intra SATD: 8x8 Hadamard transform of the source block itself (no
 * reference; 'dummy' is ignored), summing absolute coefficients and then
 * subtracting the DC magnitude so the block mean does not dominate.
 * h must be 8. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* three butterfly stages = 1D Hadamard over row i */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        /* 1D Hadamard down column i; the final stage accumulates
           |x+y| + |x-y| into the sum via BUTTERFLYA */
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3403

    
3404
/* DCT-based SAD: forward-transforms the 8x8 residual src1-src2 with the
 * context's fdct and returns the sum of absolute DCT coefficients.
 * h must be 8; 'c' must be a MpegEncContext. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte-aligned scratch block for the fdct implementations */
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
3420

    
3421
#ifdef CONFIG_GPL
3422
#define DCT8_1D {\
3423
    const int s07 = SRC(0) + SRC(7);\
3424
    const int s16 = SRC(1) + SRC(6);\
3425
    const int s25 = SRC(2) + SRC(5);\
3426
    const int s34 = SRC(3) + SRC(4);\
3427
    const int a0 = s07 + s34;\
3428
    const int a1 = s16 + s25;\
3429
    const int a2 = s07 - s34;\
3430
    const int a3 = s16 - s25;\
3431
    const int d07 = SRC(0) - SRC(7);\
3432
    const int d16 = SRC(1) - SRC(6);\
3433
    const int d25 = SRC(2) - SRC(5);\
3434
    const int d34 = SRC(3) - SRC(4);\
3435
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3436
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3437
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3438
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3439
    DST(0,  a0 + a1     ) ;\
3440
    DST(1,  a4 + (a7>>2)) ;\
3441
    DST(2,  a2 + (a3>>1)) ;\
3442
    DST(3,  a5 + (a6>>2)) ;\
3443
    DST(4,  a0 - a1     ) ;\
3444
    DST(5,  a6 - (a5>>2)) ;\
3445
    DST(6, (a2>>1) - a3 ) ;\
3446
    DST(7, (a4>>2) - a7 ) ;\
3447
}
3448

    
3449
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3450
    MpegEncContext * const s= (MpegEncContext *)c;
3451
    int16_t dct[8][8];
3452
    int i;
3453
    int sum=0;
3454

    
3455
    s->dsp.diff_pixels(dct, src1, src2, stride);
3456

    
3457
#define SRC(x) dct[i][x]
3458
#define DST(x,v) dct[i][x]= v
3459
    for( i = 0; i < 8; i++ )
3460
        DCT8_1D
3461
#undef SRC
3462
#undef DST
3463

    
3464
#define SRC(x) dct[x][i]
3465
#define DST(x,v) sum += ABS(v)
3466
    for( i = 0; i < 8; i++ )
3467
        DCT8_1D
3468
#undef SRC
3469
#undef DST
3470
    return sum;
3471
}
3472
#endif
3473

    
3474
/* DCT-max metric: forward-transforms the 8x8 residual src1-src2 and
 * returns the largest absolute DCT coefficient (not the sum).
 * h must be 8; 'c' must be a MpegEncContext. */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte-aligned scratch block for the fdct implementations */
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, ABS(temp[i]));

    return sum;
}
3490

    
3491
void simple_idct(DCTELEM *block); //FIXME
3492

    
3493
/* Quantization-noise metric: DCTs the residual, quantizes and dequantizes
 * it with the current qscale, runs an inverse DCT, and returns the squared
 * error between the round-tripped and the original coefficients. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned scratch buffer holding two 64-coefficient blocks:
       'temp' is round-tripped, 'bak' keeps the untouched copy */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* force inter-style quantization for the round trip */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3516

    
3517
/* Rate-distortion metric for one 8x8 block: quantizes the residual,
 * counts the VLC bits needed to code it, reconstructs the block, and
 * returns distortion + lambda-weighted rate. (NOTE(review): local
 * 'distoration' is a historic misspelling of 'distortion'.) */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 prediction (src2) so idct_add can reconstruct onto it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables matching the macroblock type; intra also codes DC */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run-level walk in scan order; level is biased by +64 so the
           table index range check is a single mask */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize and reconstruct to measure actual distortion */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* rate term weighted by qscale^2 (lambda), fixed-point *109/128 */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3595

    
3596
/* Rate-only metric for one 8x8 block: quantizes the residual and returns
 * the number of VLC bits needed to code it (no distortion term). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables matching the macroblock type; intra also codes DC */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run-level walk in scan order; level is biased by +64 so the
           table index range check is a single mask */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3655

    
3656
/* Vertical-SAD, intra: sum of absolute vertical gradients of a
 * 16-pixel-wide block (no reference; 'dummy' is ignored). */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += ABS(s[x] - s[x + stride]);
        s += stride;
    }

    return score;
}
3670

    
3671
/* Vertical-SAD: sum of absolute vertical gradients of the difference
 * signal s1-s2 over a 16-pixel-wide block. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += ABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3685

    
3686
/* squared value helper for the vsse metrics below */
#define SQ(a) ((a)*(a))
/* Vertical-SSE, intra: sum of squared vertical gradients of a
 * 16-pixel-wide block (no reference; 'dummy' is ignored). */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return score;
}
3701

    
3702
/* Vertical-SSE: sum of squared vertical gradients of the difference
 * signal s1-s2 over a 16-pixel-wide block. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3716

    
3717
/* Instantiate 16x16 versions of the 8x8 metrics above — presumably by
 * applying the 8x8 function to the four quadrants and summing; see the
 * WARPER8_16_SQ macro definition (dsputil.h) to confirm. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3727

    
3728
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* full 8x8 inverse DCT, then clamped store into the destination plane */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* full 8x8 inverse DCT, then clamped add onto the destination plane */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3740

    
3741
/* 4x4 reduced IDCT variants, selected for lowres==1 decoding */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3751

    
3752
/* 2x2 reduced IDCT variants, selected for lowres==2 decoding */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
}
3762

    
3763
/* 1x1 "IDCT" for lowres==3: only the DC term survives, scaled by 1/8
 * with rounding and clamped to [0,255] through the cropTbl lookup */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3775

    
3776
/* No-op placeholder for DSP function-pointer slots. C89-style `()` means
 * "unspecified parameters"; `(void)` makes the empty list explicit. */
static void just_return(void) { return; }
3777

    
3778
/* init static data */
3779
void dsputil_static_init(void)
3780
{
3781
    int i;
3782

    
3783
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3784
    for(i=0;i<MAX_NEG_CROP;i++) {
3785
        cropTbl[i] = 0;
3786
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3787
    }
3788

    
3789
    for(i=0;i<512;i++) {
3790
        squareTbl[i] = (i - 256) * (i - 256);
3791
    }
3792

    
3793
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3794
}
3795

    
3796

    
3797
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3798
{
3799
    int i;
3800

    
3801
#ifdef CONFIG_ENCODERS
3802
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3803
        c->fdct = fdct_ifast;
3804
        c->fdct248 = fdct_ifast248;
3805
    }
3806
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3807
        c->fdct = ff_faandct;
3808
        c->fdct248 = ff_faandct248;
3809
    }
3810
    else {
3811
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3812
        c->fdct248 = ff_fdct248_islow;
3813
    }
3814
#endif //CONFIG_ENCODERS
3815

    
3816
    if(avctx->lowres==1){
3817
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3818
            c->idct_put= ff_jref_idct4_put;
3819
            c->idct_add= ff_jref_idct4_add;
3820
        }else{
3821
            c->idct_put= ff_h264_lowres_idct_put_c;
3822
            c->idct_add= ff_h264_lowres_idct_add_c;
3823
        }
3824
        c->idct    = j_rev_dct4;
3825
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3826
    }else if(avctx->lowres==2){
3827
        c->idct_put= ff_jref_idct2_put;
3828
        c->idct_add= ff_jref_idct2_add;
3829
        c->idct    = j_rev_dct2;
3830
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3831
    }else if(avctx->lowres==3){
3832
        c->idct_put= ff_jref_idct1_put;
3833
        c->idct_add= ff_jref_idct1_add;
3834
        c->idct    = j_rev_dct1;
3835
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3836
    }else{
3837
        if(avctx->idct_algo==FF_IDCT_INT){
3838
            c->idct_put= ff_jref_idct_put;
3839
            c->idct_add= ff_jref_idct_add;
3840
            c->idct    = j_rev_dct;
3841
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3842
        }else if(avctx->idct_algo==FF_IDCT_VP3){
3843
            c->idct_put= ff_vp3_idct_put_c;
3844
            c->idct_add= ff_vp3_idct_add_c;
3845
            c->idct    = ff_vp3_idct_c;
3846
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3847
        }else{ //accurate/default
3848
            c->idct_put= simple_idct_put;
3849
            c->idct_add= simple_idct_add;
3850
            c->idct    = simple_idct;
3851
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3852
        }
3853
    }
3854

    
3855
    c->h264_idct_add= ff_h264_idct_add_c;
3856
    c->h264_idct8_add= ff_h264_idct8_add_c;
3857
    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3858
    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3859

    
3860
    c->get_pixels = get_pixels_c;
3861
    c->diff_pixels = diff_pixels_c;
3862
    c->put_pixels_clamped = put_pixels_clamped_c;
3863
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3864
    c->add_pixels_clamped = add_pixels_clamped_c;
3865
    c->add_pixels8 = add_pixels8_c;
3866
    c->add_pixels4 = add_pixels4_c;
3867
    c->gmc1 = gmc1_c;
3868
    c->gmc = gmc_c;
3869
    c->clear_blocks = clear_blocks_c;
3870
    c->pix_sum = pix_sum_c;
3871
    c->pix_norm1 = pix_norm1_c;
3872

    
3873
    /* TODO [0] 16  [1] 8 */
3874
    c->pix_abs[0][0] = pix_abs16_c;
3875
    c->pix_abs[0][1] = pix_abs16_x2_c;
3876
    c->pix_abs[0][2] = pix_abs16_y2_c;
3877
    c->pix_abs[0][3] = pix_abs16_xy2_c;
3878
    c->pix_abs[1][0] = pix_abs8_c;
3879
    c->pix_abs[1][1] = pix_abs8_x2_c;
3880
    c->pix_abs[1][2] = pix_abs8_y2_c;
3881
    c->pix_abs[1][3] = pix_abs8_xy2_c;
3882

    
3883
#define dspfunc(PFX, IDX, NUM) \
3884
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3885
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3886
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3887
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3888

    
3889
    dspfunc(put, 0, 16);
3890
    dspfunc(put_no_rnd, 0, 16);
3891
    dspfunc(put, 1, 8);
3892
    dspfunc(put_no_rnd, 1, 8);
3893
    dspfunc(put, 2, 4);
3894
    dspfunc(put, 3, 2);
3895

    
3896
    dspfunc(avg, 0, 16);
3897
    dspfunc(avg_no_rnd, 0, 16);
3898
    dspfunc(avg, 1, 8);
3899
    dspfunc(avg_no_rnd, 1, 8);
3900
    dspfunc(avg, 2, 4);
3901
    dspfunc(avg, 3, 2);
3902
#undef dspfunc
3903

    
3904
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3905
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3906

    
3907
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3908
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3909
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3910
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3911
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3912
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3913
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3914
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3915
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3916

    
3917
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3918
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3919
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3920
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3921
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3922
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3923
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3924
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3925
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3926

    
3927
#define dspfunc(PFX, IDX, NUM) \
3928
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3929
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3930
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3931
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3932
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3933
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3934
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3935
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3936
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3937
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3938
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3939
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3940
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3941
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3942
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3943
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3944

    
3945
    dspfunc(put_qpel, 0, 16);
3946
    dspfunc(put_no_rnd_qpel, 0, 16);
3947

    
3948
    dspfunc(avg_qpel, 0, 16);
3949
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3950

    
3951
    dspfunc(put_qpel, 1, 8);
3952
    dspfunc(put_no_rnd_qpel, 1, 8);
3953

    
3954
    dspfunc(avg_qpel, 1, 8);
3955
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3956

    
3957
    dspfunc(put_h264_qpel, 0, 16);
3958
    dspfunc(put_h264_qpel, 1, 8);
3959
    dspfunc(put_h264_qpel, 2, 4);
3960
    dspfunc(put_h264_qpel, 3, 2);
3961
    dspfunc(avg_h264_qpel, 0, 16);
3962
    dspfunc(avg_h264_qpel, 1, 8);
3963
    dspfunc(avg_h264_qpel, 2, 4);
3964

    
3965
#undef dspfunc
3966
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3967
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3968
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3969
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3970
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3971
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3972

    
3973
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3974
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3975
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3976
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3977
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3978
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3979
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3980
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3981
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3982
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3983
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3984
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3985
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3986
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3987
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3988
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3989
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3990
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3991
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3992
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3993

    
3994
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3995
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3996
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3997
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3998
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3999
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4000
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4001
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4002

    
4003
#define SET_CMP_FUNC(name) \
4004
    c->name[0]= name ## 16_c;\
4005
    c->name[1]= name ## 8x8_c;
4006

    
4007
    SET_CMP_FUNC(hadamard8_diff)
4008
    c->hadamard8_diff[4]= hadamard8_intra16_c;
4009
    SET_CMP_FUNC(dct_sad)
4010
    SET_CMP_FUNC(dct_max)
4011
#ifdef CONFIG_GPL
4012
    SET_CMP_FUNC(dct264_sad)
4013
#endif
4014
    c->sad[0]= pix_abs16_c;
4015
    c->sad[1]= pix_abs8_c;
4016
    c->sse[0]= sse16_c;
4017
    c->sse[1]= sse8_c;
4018
    c->sse[2]= sse4_c;
4019
    SET_CMP_FUNC(quant_psnr)
4020
    SET_CMP_FUNC(rd)
4021
    SET_CMP_FUNC(bit)
4022
    c->vsad[0]= vsad16_c;
4023
    c->vsad[4]= vsad_intra16_c;
4024
    c->vsse[0]= vsse16_c;
4025
    c->vsse[4]= vsse_intra16_c;
4026
    c->nsse[0]= nsse16_c;
4027
    c->nsse[1]= nsse8_c;
4028
    c->w53[0]= w53_16_c;
4029
    c->w53[1]= w53_8_c;
4030
    c->w97[0]= w97_16_c;
4031
    c->w97[1]= w97_8_c;
4032

    
4033
    c->add_bytes= add_bytes_c;
4034
    c->diff_bytes= diff_bytes_c;
4035
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4036
    c->bswap_buf= bswap_buf;
4037

    
4038
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4039
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4040
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4041
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4042
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4043
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4044

    
4045
    c->h263_h_loop_filter= h263_h_loop_filter_c;
4046
    c->h263_v_loop_filter= h263_v_loop_filter_c;
4047

    
4048
    c->h261_loop_filter= h261_loop_filter_c;
4049

    
4050
    c->try_8x8basis= try_8x8basis_c;
4051
    c->add_8x8basis= add_8x8basis_c;
4052

    
4053
#ifdef CONFIG_SNOW_ENCODER
4054
    c->vertical_compose97i = ff_snow_vertical_compose97i;
4055
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4056
    c->inner_add_yblock = ff_snow_inner_add_yblock;
4057
#endif
4058

    
4059
    c->prefetch= just_return;
4060

    
4061
#ifdef HAVE_MMX
4062
    dsputil_init_mmx(c, avctx);
4063
#endif
4064
#ifdef ARCH_ARMV4L
4065
    dsputil_init_armv4l(c, avctx);
4066
#endif
4067
#ifdef HAVE_MLIB
4068
    dsputil_init_mlib(c, avctx);
4069
#endif
4070
#ifdef ARCH_SPARC
4071
   dsputil_init_vis(c,avctx);
4072
#endif
4073
#ifdef ARCH_ALPHA
4074
    dsputil_init_alpha(c, avctx);
4075
#endif
4076
#ifdef ARCH_POWERPC
4077
    dsputil_init_ppc(c, avctx);
4078
#endif
4079
#ifdef HAVE_MMI
4080
    dsputil_init_mmi(c, avctx);
4081
#endif
4082
#ifdef ARCH_SH4
4083
    dsputil_init_sh4(c,avctx);
4084
#endif
4085

    
4086
    switch(c->idct_permutation_type){
4087
    case FF_NO_IDCT_PERM:
4088
        for(i=0; i<64; i++)
4089
            c->idct_permutation[i]= i;
4090
        break;
4091
    case FF_LIBMPEG2_IDCT_PERM:
4092
        for(i=0; i<64; i++)
4093
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4094
        break;
4095
    case FF_SIMPLE_IDCT_PERM:
4096
        for(i=0; i<64; i++)
4097
            c->idct_permutation[i]= simple_mmx_permutation[i];
4098
        break;
4099
    case FF_TRANSPOSE_IDCT_PERM:
4100
        for(i=0; i<64; i++)
4101
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4102
        break;
4103
    case FF_PARTTRANS_IDCT_PERM:
4104
        for(i=0; i<64; i++)
4105
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4106
        break;
4107
    default:
4108
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4109
    }
4110
}
4111