Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ b482e2d1

History | View | Annotate | Download (145 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * This library is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2 of the License, or (at your option) any later version.
10
 *
11
 * This library is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with this library; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21
 */
22

    
23
/**
24
 * @file dsputil.c
25
 * DSP utils
26
 */
27

    
28
#include "avcodec.h"
29
#include "dsputil.h"
30
#include "mpegvideo.h"
31
#include "simple_idct.h"
32
#include "faandct.h"
33
#include "snow.h"
34

    
35
/* snow.c */
36
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37

    
38
/* Byte lookup table; the *_clamped_c routines in this file index it through
 * cropTbl + MAX_NEG_CROP. Zero-initialised here (presumably filled in by
 * init code elsewhere -- TODO confirm). */
uint8_t cropTbl[2 * MAX_NEG_CROP + 256] = { 0 };
39
/* 512-entry table read through squareTbl + 256 by the pix_norm1/sse routines,
 * which index it with values in -255..255. Zero-initialised here (presumably
 * filled with squares by init code elsewhere -- TODO confirm). */
uint32_t squareTbl[512] = { 0 };
40

    
41
/* Zigzag scan order for an 8x8 block: entry i holds the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
51

    
52
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63
};
64

    
65
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
66
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
67

    
68
/* Alternate (horizontal) coefficient scan: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63
};
78

    
79
/* Alternate (vertical) coefficient scan: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63
};
89

    
90
/*
 * Fixed-point reciprocals: for every 0 <= a <= 65536 and 2 <= b <= 255,
 * (a * inverse[b]) >> 32 == a / b when the product is formed in 64 bits.
 * Entries 0 and 1 are outside that guarantee.
 */
const uint32_t inverse[256] = {
             0, 4294967295U, 2147483648U,  1431655766,  1073741824,   858993460,   715827883,   613566757,
     536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154,
     268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709,
     178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333,
     134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367,
     107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283,
      89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315,
      76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085,
      67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498,
      59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675,
      53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441,
      48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183,
      44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712,
      41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400,
      38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163,
      35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641,
      33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573,
      31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737,
      29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493,
      28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373,
      26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368,
      25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671,
      24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767,
      23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740,
      22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751,
      21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635,
      20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593,
      19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944,
      19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933,
      18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575,
      17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532,
      17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010
};
125

    
126
/* Coefficient order expected on input by simple_idct_mmx
 * (a permutation of the 64 block positions 0x00..0x3F). */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F
};
137

    
138
/**
 * Sum of all pixel values in a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int total = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
159

    
160
static int pix_norm1_c(uint8_t * pix, int line_size)
161
{
162
    int s, i, j;
163
    uint32_t *sq = squareTbl + 256;
164

    
165
    s = 0;
166
    for (i = 0; i < 16; i++) {
167
        for (j = 0; j < 16; j += 8) {
168
#if 0
169
            s += sq[pix[0]];
170
            s += sq[pix[1]];
171
            s += sq[pix[2]];
172
            s += sq[pix[3]];
173
            s += sq[pix[4]];
174
            s += sq[pix[5]];
175
            s += sq[pix[6]];
176
            s += sq[pix[7]];
177
#else
178
#if LONG_MAX > 2147483647
179
            register uint64_t x=*(uint64_t*)pix;
180
            s += sq[x&0xff];
181
            s += sq[(x>>8)&0xff];
182
            s += sq[(x>>16)&0xff];
183
            s += sq[(x>>24)&0xff];
184
            s += sq[(x>>32)&0xff];
185
            s += sq[(x>>40)&0xff];
186
            s += sq[(x>>48)&0xff];
187
            s += sq[(x>>56)&0xff];
188
#else
189
            register uint32_t x=*(uint32_t*)pix;
190
            s += sq[x&0xff];
191
            s += sq[(x>>8)&0xff];
192
            s += sq[(x>>16)&0xff];
193
            s += sq[(x>>24)&0xff];
194
            x=*(uint32_t*)(pix+4);
195
            s += sq[x&0xff];
196
            s += sq[(x>>8)&0xff];
197
            s += sq[(x>>16)&0xff];
198
            s += sq[(x>>24)&0xff];
199
#endif
200
#endif
201
            pix += 8;
202
        }
203
        pix += line_size - 16;
204
    }
205
    return s;
206
}
207

    
208
/**
 * Apply bswap_32() to each of the first w 32-bit words of src, storing the
 * results in dst in ascending index order.
 * @param dst destination words
 * @param src source words
 * @param w   number of words to convert; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
225

    
226
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227
{
228
    int s, i;
229
    uint32_t *sq = squareTbl + 256;
230

    
231
    s = 0;
232
    for (i = 0; i < h; i++) {
233
        s += sq[pix1[0] - pix2[0]];
234
        s += sq[pix1[1] - pix2[1]];
235
        s += sq[pix1[2] - pix2[2]];
236
        s += sq[pix1[3] - pix2[3]];
237
        pix1 += line_size;
238
        pix2 += line_size;
239
    }
240
    return s;
241
}
242

    
243
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244
{
245
    int s, i;
246
    uint32_t *sq = squareTbl + 256;
247

    
248
    s = 0;
249
    for (i = 0; i < h; i++) {
250
        s += sq[pix1[0] - pix2[0]];
251
        s += sq[pix1[1] - pix2[1]];
252
        s += sq[pix1[2] - pix2[2]];
253
        s += sq[pix1[3] - pix2[3]];
254
        s += sq[pix1[4] - pix2[4]];
255
        s += sq[pix1[5] - pix2[5]];
256
        s += sq[pix1[6] - pix2[6]];
257
        s += sq[pix1[7] - pix2[7]];
258
        pix1 += line_size;
259
        pix2 += line_size;
260
    }
261
    return s;
262
}
263

    
264
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
265
{
266
    int s, i;
267
    uint32_t *sq = squareTbl + 256;
268

    
269
    s = 0;
270
    for (i = 0; i < h; i++) {
271
        s += sq[pix1[ 0] - pix2[ 0]];
272
        s += sq[pix1[ 1] - pix2[ 1]];
273
        s += sq[pix1[ 2] - pix2[ 2]];
274
        s += sq[pix1[ 3] - pix2[ 3]];
275
        s += sq[pix1[ 4] - pix2[ 4]];
276
        s += sq[pix1[ 5] - pix2[ 5]];
277
        s += sq[pix1[ 6] - pix2[ 6]];
278
        s += sq[pix1[ 7] - pix2[ 7]];
279
        s += sq[pix1[ 8] - pix2[ 8]];
280
        s += sq[pix1[ 9] - pix2[ 9]];
281
        s += sq[pix1[10] - pix2[10]];
282
        s += sq[pix1[11] - pix2[11]];
283
        s += sq[pix1[12] - pix2[12]];
284
        s += sq[pix1[13] - pix2[13]];
285
        s += sq[pix1[14] - pix2[14]];
286
        s += sq[pix1[15] - pix2[15]];
287

    
288
        pix1 += line_size;
289
        pix2 += line_size;
290
    }
291
    return s;
292
}
293

    
294

    
295
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
296
/**
 * Wavelet-domain distortion between two w x h blocks.
 *
 * The pixel differences (scaled by 16) are written into a 32x32 scratch
 * buffer, transformed in place by ff_spatial_dwt(), and the absolute values
 * of the resulting coefficients are summed with a per-level, per-orientation
 * weight taken from scale[][][][].
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; must equal h and be a multiple of 4, at most 32
 * @param h         block height
 * @param type      first index into scale[]: 0 selects the 9/7 weights,
 *                  1 the 5/3 weights; also forwarded to ff_spatial_dwt()
 * @return weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* Check the geometry before anything is written: tmp[] has 32 rows of
     * 32 entries and the summation below assumes a square block. */
    assert(w==h);
    assert(w<=32 && h<=32);

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    for(level=0; level<dec_count; level++){
        /* subband geometry depends only on the level */
        const int size  = w>>(dec_count-level);
        const int stride= 32<<(dec_count-level);

        /* orientation 0 is visited only at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            const int sx= (ori&1) ? size : 0;        /* bit 0: column offset */
            const int sy= (ori&2) ? stride>>1 : 0;   /* bit 1: row offset    */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
364

    
365
/* w_c() with width 8 and type 1 (the 5/3 weight set in w_c's scale table). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 8, h, 1);

    return score;
}
368

    
369
/* w_c() with width 8 and type 0 (the 9/7 weight set in w_c's scale table). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 8, h, 0);

    return score;
}
372

    
373
/* w_c() with width 16 and type 1 (the 5/3 weight set in w_c's scale table). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 16, h, 1);

    return score;
}
376

    
377
/* w_c() with width 16 and type 0 (the 9/7 weight set in w_c's scale table). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 16, h, 0);

    return score;
}
380

    
381
/* w_c() with width 32 and type 1 (the 5/3 weight set in w_c's scale table).
 * Has external linkage, unlike the 8- and 16-wide variants. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 32, h, 1);

    return score;
}
384

    
385
/* w_c() with width 32 and type 0 (the 9/7 weight set in w_c's scale table).
 * Has external linkage, unlike the 8- and 16-wide variants. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int score = w_c(v, pix1, pix2, line_size, 32, h, 0);

    return score;
}
388
#endif
389

    
390
/**
 * Copy an 8x8 block of pixels into a coefficient block.
 * @param block     destination, 64 consecutive DCTELEMs (8 per row)
 * @param pixels    top-left source pixel
 * @param line_size source row stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        block  += 8;
        pixels += line_size;
    }
}
408

    
409
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks.
 * @param block  destination, 64 consecutive DCTELEMs (8 per row)
 * @param s1     top-left pixel of the minuend block
 * @param s2     top-left pixel of the subtrahend block
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
428

    
429

    
430
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431
                                 int line_size)
432
{
433
    int i;
434
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
435

    
436
    /* read the pixels */
437
    for(i=0;i<8;i++) {
438
        pixels[0] = cm[block[0]];
439
        pixels[1] = cm[block[1]];
440
        pixels[2] = cm[block[2]];
441
        pixels[3] = cm[block[3]];
442
        pixels[4] = cm[block[4]];
443
        pixels[5] = cm[block[5]];
444
        pixels[6] = cm[block[6]];
445
        pixels[7] = cm[block[7]];
446

    
447
        pixels += line_size;
448
        block += 8;
449
    }
450
}
451

    
452
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453
                                 int line_size)
454
{
455
    int i;
456
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
457

    
458
    /* read the pixels */
459
    for(i=0;i<4;i++) {
460
        pixels[0] = cm[block[0]];
461
        pixels[1] = cm[block[1]];
462
        pixels[2] = cm[block[2]];
463
        pixels[3] = cm[block[3]];
464

    
465
        pixels += line_size;
466
        block += 8;
467
    }
468
}
469

    
470
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
471
                                 int line_size)
472
{
473
    int i;
474
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
475

    
476
    /* read the pixels */
477
    for(i=0;i<2;i++) {
478
        pixels[0] = cm[block[0]];
479
        pixels[1] = cm[block[1]];
480

    
481
        pixels += line_size;
482
        block += 8;
483
    }
484
}
485

    
486
/**
 * Write an 8x8 block of signed coefficients as unsigned pixels: each value is
 * biased by +128 and saturated, so anything below -128 becomes 0 and anything
 * above 127 becomes 255.
 * @param block     source, 64 consecutive coefficients
 * @param pixels    top-left destination pixel
 * @param line_size destination row stride in bytes
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const DCTELEM coef = block[col];

            if (coef < -128)
                pixels[col] = 0;
            else if (coef > 127)
                pixels[col] = 255;
            else
                pixels[col] = (uint8_t)(coef + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
506

    
507
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
508
                          int line_size)
509
{
510
    int i;
511
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
512

    
513
    /* read the pixels */
514
    for(i=0;i<8;i++) {
515
        pixels[0] = cm[pixels[0] + block[0]];
516
        pixels[1] = cm[pixels[1] + block[1]];
517
        pixels[2] = cm[pixels[2] + block[2]];
518
        pixels[3] = cm[pixels[3] + block[3]];
519
        pixels[4] = cm[pixels[4] + block[4]];
520
        pixels[5] = cm[pixels[5] + block[5]];
521
        pixels[6] = cm[pixels[6] + block[6]];
522
        pixels[7] = cm[pixels[7] + block[7]];
523
        pixels += line_size;
524
        block += 8;
525
    }
526
}
527

    
528
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529
                          int line_size)
530
{
531
    int i;
532
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
533

    
534
    /* read the pixels */
535
    for(i=0;i<4;i++) {
536
        pixels[0] = cm[pixels[0] + block[0]];
537
        pixels[1] = cm[pixels[1] + block[1]];
538
        pixels[2] = cm[pixels[2] + block[2]];
539
        pixels[3] = cm[pixels[3] + block[3]];
540
        pixels += line_size;
541
        block += 8;
542
    }
543
}
544

    
545
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546
                          int line_size)
547
{
548
    int i;
549
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
550

    
551
    /* read the pixels */
552
    for(i=0;i<2;i++) {
553
        pixels[0] = cm[pixels[0] + block[0]];
554
        pixels[1] = cm[pixels[1] + block[1]];
555
        pixels += line_size;
556
        block += 8;
557
    }
558
}
559

    
560
/**
 * Add an 8x8 coefficient block to the pixels without any clamping: each sum
 * is simply stored back into the uint8_t pixel.
 * @param pixels    top-left pixel, updated in place
 * @param block     coefficients to add, 8 per row
 * @param line_size pixel row stride in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
576

    
577
/**
 * Add a 4x4 coefficient block to the pixels without any clamping: each sum
 * is simply stored back into the uint8_t pixel. Here the coefficient rows
 * are packed 4 apart (not 8 as in the *_clamped4 helpers).
 * @param pixels    top-left pixel, updated in place
 * @param block     coefficients to add, 4 per row
 * @param line_size pixel row stride in bytes
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
589

    
590
#if 0
591

592
/*
 * PIXOP2, 64-bit flavour (kept out of the build by the enclosing "#if 0").
 *
 * For a given OPNAME/OP pair this generates 8-pixel-wide routines that
 * process one uint64_t per row:
 *   _pixels_c               plain copy / OP of the source row
 *   _[no_rnd_]pixels_x2_c   average of each pixel and its right neighbour
 *   _[no_rnd_]pixels_y2_c   average of each pixel and the one below it
 *   _[no_rnd_]pixels_xy2_c  average of the 2x2 neighbourhood
 * and, through CALL_2X_PIXELS, 16-wide wrappers built from two 8-wide calls.
 *
 * Byte-parallel averaging: (a|b) - (((a^b)&0xFE..)>>1) rounds each byte up,
 * (a&b) + (((a^b)&0xFE..)>>1) rounds each byte down. The xy2 forms split every
 * byte into its low 2 bits (l*) and high 6 bits (h*) so four samples can be
 * summed without carries crossing byte lanes; the rounding bias is 2 per byte
 * for the rounding variant and 1 for no_rnd.
 *
 * The plain routine is named _pixels_c because that is the name the
 * CALL_2X_PIXELS line for _pixels16_c refers to.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        /* two output rows per iteration; l0/h0 and l1/h1 alternate as the */\
        /* partial sums of the upper and lower source row                  */\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* a = per-byte average of a and b, rounded up, on eight packed bytes */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
732
#else // 64 bit variant
733

    
734
#define PIXOP2(OPNAME, OP) \
735
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
736
    int i;\
737
    for(i=0; i<h; i++){\
738
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
739
        pixels+=line_size;\
740
        block +=line_size;\
741
    }\
742
}\
743
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
744
    int i;\
745
    for(i=0; i<h; i++){\
746
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
747
        pixels+=line_size;\
748
        block +=line_size;\
749
    }\
750
}\
751
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
752
    int i;\
753
    for(i=0; i<h; i++){\
754
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
755
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
756
        pixels+=line_size;\
757
        block +=line_size;\
758
    }\
759
}\
760
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
762
}\
763
\
764
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765
                                                int src_stride1, int src_stride2, int h){\
766
    int i;\
767
    for(i=0; i<h; i++){\
768
        uint32_t a,b;\
769
        a= LD32(&src1[i*src_stride1  ]);\
770
        b= LD32(&src2[i*src_stride2  ]);\
771
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
772
        a= LD32(&src1[i*src_stride1+4]);\
773
        b= LD32(&src2[i*src_stride2+4]);\
774
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
775
    }\
776
}\
777
\
778
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779
                                                int src_stride1, int src_stride2, int h){\
780
    int i;\
781
    for(i=0; i<h; i++){\
782
        uint32_t a,b;\
783
        a= LD32(&src1[i*src_stride1  ]);\
784
        b= LD32(&src2[i*src_stride2  ]);\
785
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
786
        a= LD32(&src1[i*src_stride1+4]);\
787
        b= LD32(&src2[i*src_stride2+4]);\
788
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
789
    }\
790
}\
791
\
792
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793
                                                int src_stride1, int src_stride2, int h){\
794
    int i;\
795
    for(i=0; i<h; i++){\
796
        uint32_t a,b;\
797
        a= LD32(&src1[i*src_stride1  ]);\
798
        b= LD32(&src2[i*src_stride2  ]);\
799
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
800
    }\
801
}\
802
\
803
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
804
                                                int src_stride1, int src_stride2, int h){\
805
    int i;\
806
    for(i=0; i<h; i++){\
807
        uint32_t a,b;\
808
        a= LD16(&src1[i*src_stride1  ]);\
809
        b= LD16(&src2[i*src_stride2  ]);\
810
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
811
    }\
812
}\
813
\
814
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
815
                                                int src_stride1, int src_stride2, int h){\
816
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
817
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
818
}\
819
\
820
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821
                                                int src_stride1, int src_stride2, int h){\
822
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
823
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
824
}\
825
\
826
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
827
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
828
}\
829
\
830
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
831
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832
}\
833
\
834
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
836
}\
837
\
838
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
839
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840
}\
841
\
842
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
843
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
844
    int i;\
845
    for(i=0; i<h; i++){\
846
        uint32_t a, b, c, d, l0, l1, h0, h1;\
847
        a= LD32(&src1[i*src_stride1]);\
848
        b= LD32(&src2[i*src_stride2]);\
849
        c= LD32(&src3[i*src_stride3]);\
850
        d= LD32(&src4[i*src_stride4]);\
851
        l0=  (a&0x03030303UL)\
852
           + (b&0x03030303UL)\
853
           + 0x02020202UL;\
854
        h0= ((a&0xFCFCFCFCUL)>>2)\
855
          + ((b&0xFCFCFCFCUL)>>2);\
856
        l1=  (c&0x03030303UL)\
857
           + (d&0x03030303UL);\
858
        h1= ((c&0xFCFCFCFCUL)>>2)\
859
          + ((d&0xFCFCFCFCUL)>>2);\
860
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
861
        a= LD32(&src1[i*src_stride1+4]);\
862
        b= LD32(&src2[i*src_stride2+4]);\
863
        c= LD32(&src3[i*src_stride3+4]);\
864
        d= LD32(&src4[i*src_stride4+4]);\
865
        l0=  (a&0x03030303UL)\
866
           + (b&0x03030303UL)\
867
           + 0x02020202UL;\
868
        h0= ((a&0xFCFCFCFCUL)>>2)\
869
          + ((b&0xFCFCFCFCUL)>>2);\
870
        l1=  (c&0x03030303UL)\
871
           + (d&0x03030303UL);\
872
        h1= ((c&0xFCFCFCFCUL)>>2)\
873
          + ((d&0xFCFCFCFCUL)>>2);\
874
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
875
    }\
876
}\
877
\
878
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
879
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
880
}\
881
\
882
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
884
}\
885
\
886
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
888
}\
889
\
890
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
892
}\
893
\
894
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
895
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
896
    int i;\
897
    for(i=0; i<h; i++){\
898
        uint32_t a, b, c, d, l0, l1, h0, h1;\
899
        a= LD32(&src1[i*src_stride1]);\
900
        b= LD32(&src2[i*src_stride2]);\
901
        c= LD32(&src3[i*src_stride3]);\
902
        d= LD32(&src4[i*src_stride4]);\
903
        l0=  (a&0x03030303UL)\
904
           + (b&0x03030303UL)\
905
           + 0x01010101UL;\
906
        h0= ((a&0xFCFCFCFCUL)>>2)\
907
          + ((b&0xFCFCFCFCUL)>>2);\
908
        l1=  (c&0x03030303UL)\
909
           + (d&0x03030303UL);\
910
        h1= ((c&0xFCFCFCFCUL)>>2)\
911
          + ((d&0xFCFCFCFCUL)>>2);\
912
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
913
        a= LD32(&src1[i*src_stride1+4]);\
914
        b= LD32(&src2[i*src_stride2+4]);\
915
        c= LD32(&src3[i*src_stride3+4]);\
916
        d= LD32(&src4[i*src_stride4+4]);\
917
        l0=  (a&0x03030303UL)\
918
           + (b&0x03030303UL)\
919
           + 0x01010101UL;\
920
        h0= ((a&0xFCFCFCFCUL)>>2)\
921
          + ((b&0xFCFCFCFCUL)>>2);\
922
        l1=  (c&0x03030303UL)\
923
           + (d&0x03030303UL);\
924
        h1= ((c&0xFCFCFCFCUL)>>2)\
925
          + ((d&0xFCFCFCFCUL)>>2);\
926
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
927
    }\
928
}\
929
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
930
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
931
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
932
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
933
}\
934
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938
}\
939
\
940
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
941
{\
942
        int i, a0, b0, a1, b1;\
943
        a0= pixels[0];\
944
        b0= pixels[1] + 2;\
945
        a0 += b0;\
946
        b0 += pixels[2];\
947
\
948
        pixels+=line_size;\
949
        for(i=0; i<h; i+=2){\
950
            a1= pixels[0];\
951
            b1= pixels[1];\
952
            a1 += b1;\
953
            b1 += pixels[2];\
954
\
955
            block[0]= (a1+a0)>>2; /* FIXME non put */\
956
            block[1]= (b1+b0)>>2;\
957
\
958
            pixels+=line_size;\
959
            block +=line_size;\
960
\
961
            a0= pixels[0];\
962
            b0= pixels[1] + 2;\
963
            a0 += b0;\
964
            b0 += pixels[2];\
965
\
966
            block[0]= (a1+a0)>>2;\
967
            block[1]= (b1+b0)>>2;\
968
            pixels+=line_size;\
969
            block +=line_size;\
970
        }\
971
}\
972
\
973
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
974
{\
975
        int i;\
976
        const uint32_t a= LD32(pixels  );\
977
        const uint32_t b= LD32(pixels+1);\
978
        uint32_t l0=  (a&0x03030303UL)\
979
                    + (b&0x03030303UL)\
980
                    + 0x02020202UL;\
981
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
982
                   + ((b&0xFCFCFCFCUL)>>2);\
983
        uint32_t l1,h1;\
984
\
985
        pixels+=line_size;\
986
        for(i=0; i<h; i+=2){\
987
            uint32_t a= LD32(pixels  );\
988
            uint32_t b= LD32(pixels+1);\
989
            l1=  (a&0x03030303UL)\
990
               + (b&0x03030303UL);\
991
            h1= ((a&0xFCFCFCFCUL)>>2)\
992
              + ((b&0xFCFCFCFCUL)>>2);\
993
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994
            pixels+=line_size;\
995
            block +=line_size;\
996
            a= LD32(pixels  );\
997
            b= LD32(pixels+1);\
998
            l0=  (a&0x03030303UL)\
999
               + (b&0x03030303UL)\
1000
               + 0x02020202UL;\
1001
            h0= ((a&0xFCFCFCFCUL)>>2)\
1002
              + ((b&0xFCFCFCFCUL)>>2);\
1003
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004
            pixels+=line_size;\
1005
            block +=line_size;\
1006
        }\
1007
}\
1008
\
1009
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010
{\
1011
    int j;\
1012
    for(j=0; j<2; j++){\
1013
        int i;\
1014
        const uint32_t a= LD32(pixels  );\
1015
        const uint32_t b= LD32(pixels+1);\
1016
        uint32_t l0=  (a&0x03030303UL)\
1017
                    + (b&0x03030303UL)\
1018
                    + 0x02020202UL;\
1019
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020
                   + ((b&0xFCFCFCFCUL)>>2);\
1021
        uint32_t l1,h1;\
1022
\
1023
        pixels+=line_size;\
1024
        for(i=0; i<h; i+=2){\
1025
            uint32_t a= LD32(pixels  );\
1026
            uint32_t b= LD32(pixels+1);\
1027
            l1=  (a&0x03030303UL)\
1028
               + (b&0x03030303UL);\
1029
            h1= ((a&0xFCFCFCFCUL)>>2)\
1030
              + ((b&0xFCFCFCFCUL)>>2);\
1031
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032
            pixels+=line_size;\
1033
            block +=line_size;\
1034
            a= LD32(pixels  );\
1035
            b= LD32(pixels+1);\
1036
            l0=  (a&0x03030303UL)\
1037
               + (b&0x03030303UL)\
1038
               + 0x02020202UL;\
1039
            h0= ((a&0xFCFCFCFCUL)>>2)\
1040
              + ((b&0xFCFCFCFCUL)>>2);\
1041
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042
            pixels+=line_size;\
1043
            block +=line_size;\
1044
        }\
1045
        pixels+=4-line_size*(h+1);\
1046
        block +=4-line_size*h;\
1047
    }\
1048
}\
1049
\
1050
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051
{\
1052
    int j;\
1053
    for(j=0; j<2; j++){\
1054
        int i;\
1055
        const uint32_t a= LD32(pixels  );\
1056
        const uint32_t b= LD32(pixels+1);\
1057
        uint32_t l0=  (a&0x03030303UL)\
1058
                    + (b&0x03030303UL)\
1059
                    + 0x01010101UL;\
1060
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061
                   + ((b&0xFCFCFCFCUL)>>2);\
1062
        uint32_t l1,h1;\
1063
\
1064
        pixels+=line_size;\
1065
        for(i=0; i<h; i+=2){\
1066
            uint32_t a= LD32(pixels  );\
1067
            uint32_t b= LD32(pixels+1);\
1068
            l1=  (a&0x03030303UL)\
1069
               + (b&0x03030303UL);\
1070
            h1= ((a&0xFCFCFCFCUL)>>2)\
1071
              + ((b&0xFCFCFCFCUL)>>2);\
1072
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073
            pixels+=line_size;\
1074
            block +=line_size;\
1075
            a= LD32(pixels  );\
1076
            b= LD32(pixels+1);\
1077
            l0=  (a&0x03030303UL)\
1078
               + (b&0x03030303UL)\
1079
               + 0x01010101UL;\
1080
            h0= ((a&0xFCFCFCFCUL)>>2)\
1081
              + ((b&0xFCFCFCFCUL)>>2);\
1082
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083
            pixels+=line_size;\
1084
            block +=line_size;\
1085
        }\
1086
        pixels+=4-line_size*(h+1);\
1087
        block +=4-line_size*h;\
1088
    }\
1089
}\
1090
\
1091
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1092
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1096
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1099

    
1100
#define op_avg(a, b) a = rnd_avg32(a, b)
1101
#endif
1102
#define op_put(a, b) a = b
1103

    
1104
PIXOP2(avg, op_avg)
1105
PIXOP2(put, op_put)
1106
#undef op_avg
1107
#undef op_put
1108

    
1109
/* Rounded-up integer mean of two / four values.
 * Every argument is parenthesized so that an argument containing an operator
 * of lower precedence than '+' (for example a|b or a<<1) is still averaged as
 * one operand. Arguments are evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111

    
1112
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the one stride
 * argument is used for the destination and for both sources a and b.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1115

    
1116
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the one stride
 * argument is used for the destination and for both sources a and b.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1119

    
1120
/**
 * Bilinear interpolation of an 8-pixel-wide block, h rows high.
 *
 * x16 and y16 act as horizontal/vertical weights out of 16; the four corner
 * weights derived from them always add up to 256, hence the final >>8.
 * rounder is added to every weighted sum before the shift.
 * Each output row reads 9 samples from the current source row and 9 from the
 * row below it; dst and src advance by the same stride.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left    = (16-x16)*(16-y16);
    const int w_top_right   = (   x16)*(16-y16);
    const int w_bot_left    = (16-x16)*(   y16);
    const int w_bot_right   = (   x16)*(   y16);
    int row, col;

    for(row=0; row<h; row++, dst+=stride, src+=stride)
    {
        const uint8_t *below= src + stride;

        for(col=0; col<8; col++)
        {
            dst[col]= (  w_top_left *src[col]   + w_top_right*src[col+1]
                       + w_bot_left *below[col] + w_bot_right*below[col+1]
                       + rounder)>>8;
        }
    }
}
1142

    
1143
/**
 * Warped motion compensation of an 8-pixel-wide block, h rows high.
 *
 * (ox, oy) is the source position of the first output pixel; it moves by
 * (dxx, dyx) per output column and by (dxy, dyy) per output row. After
 * dropping the low 16 bits, the next 'shift' bits of a position are the
 * bilinear fraction and the rest is the integer sample coordinate.
 * r is added before the final >>(2*shift).
 *
 * Positions outside the picture fall back to interpolation along the one
 * axis that is still inside, or to a plain copy when both are outside; the
 * coordinates are passed through clip() first (defined elsewhere,
 * presumably a clamp to [min, max] -- confirm against its definition).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift;
    const int frac_mask= s-1;
    int row;

    /* after the decrement, a coordinate below width/height still has a
       neighbour at +1 inside the original dimensions */
    width--;
    height--;

    for(row=0; row<h; row++, ox+=dxy, oy+=dyy){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++, vx+=dxx, vy+=dyx){ //XXX FIXME optimize
            int src_x= vx>>16;
            int src_y= vy>>16;
            const int frac_x= src_x&frac_mask;
            const int frac_y= src_y&frac_mask;
            const int inv_x= s-frac_x;
            const int inv_y= s-frac_y;
            int x_inside, y_inside, index;

            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside= (unsigned)src_x < width;
            y_inside= (unsigned)src_y < height;

            if(x_inside && y_inside){
                /* full 2x2 bilinear blend */
                index= src_x + src_y*stride;
                out[col]= (  (  src[index         ]*inv_x
                              + src[index       +1]*frac_x )*inv_y
                           + (  src[index+stride  ]*inv_x
                              + src[index+stride+1]*frac_x )*frac_y
                           + r)>>(shift*2);
            }else if(x_inside){
                /* row out of range: horizontal blend only */
                index= src_x + clip(src_y, 0, height)*stride;
                out[col]= ( (  src[index         ]*inv_x
                             + src[index       +1]*frac_x )*s
                           + r)>>(shift*2);
            }else if(y_inside){
                /* column out of range: vertical blend only */
                index= clip(src_x, 0, width) + src_y*stride;
                out[col]= (  (  src[index         ]*inv_y
                              + src[index+stride  ]*frac_y )*s
                           + r)>>(shift*2);
            }else{
                /* both out of range: copy the clipped sample */
                index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                out[col]=    src[index         ];
            }
        }
    }
}
1200

    
1201
/**
 * Full-pel "put" case: dispatches on the block width to the matching
 * put_pixelsN_c() copy routine. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1209

    
1210
/**
 * Horizontal interpolation weighted 2:1 towards the left sample.
 * The weighted sum (2*left + right + 1) is scaled by 683/2048, which is
 * roughly 1/3. Reads width+1 samples from each of the height source rows.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1220

    
1221
/**
 * Horizontal interpolation weighted 1:2 towards the right sample.
 * The weighted sum (left + 2*right + 1) is scaled by 683/2048, which is
 * roughly 1/3. Reads width+1 samples from each of the height source rows.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1231

    
1232
/**
 * Vertical interpolation weighted 2:1 towards the upper sample.
 * The weighted sum (2*upper + lower + 1) is scaled by 683/2048, which is
 * roughly 1/3. Reads height+1 source rows of width samples.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1242

    
1243
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 4 (upper left), 3 (upper right), 3 (lower left), 2 (lower right).
 * The weights total 12; the sum plus 6 is scaled by 2731/32768, which is
 * roughly 1/12. Reads width+1 samples from height+1 source rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1253

    
1254
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 3 (upper left), 2 (upper right), 4 (lower left), 3 (lower right).
 * The weights total 12; the sum plus 6 is scaled by 2731/32768, which is
 * roughly 1/12. Reads width+1 samples from height+1 source rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1264

    
1265
/**
 * Vertical interpolation weighted 1:2 towards the lower sample.
 * The weighted sum (upper + 2*lower + 1) is scaled by 683/2048, which is
 * roughly 1/3. Reads height+1 source rows of width samples.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1275

    
1276
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 3 (upper left), 4 (upper right), 2 (lower left), 3 (lower right).
 * The weights total 12; the sum plus 6 is scaled by 2731/32768, which is
 * roughly 1/12. Reads width+1 samples from height+1 source rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1286

    
1287
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 2 (upper left), 3 (upper right), 3 (lower left), 4 (lower right).
 * The weights total 12; the sum plus 6 is scaled by 2731/32768, which is
 * roughly 1/12. Reads width+1 samples from height+1 source rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1297

    
1298
/**
 * Full-pel "avg" case: dispatches on the block width to the matching
 * avg_pixelsN_c() routine. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1306

    
1307
/**
 * Horizontal 2:1 interpolation (left sample weighted double, scaled by
 * 683/2048, roughly 1/3), then averaged with the value already in dst,
 * rounding up. Reads width+1 samples from each of the height source rows.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1317

    
1318
/**
 * Horizontal 1:2 interpolation (right sample weighted double, scaled by
 * 683/2048, roughly 1/3), then averaged with the value already in dst,
 * rounding up. Reads width+1 samples from each of the height source rows.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1328

    
1329
/**
 * Vertical 2:1 interpolation (upper sample weighted double, scaled by
 * 683/2048, roughly 1/3), then averaged with the value already in dst,
 * rounding up. Reads height+1 source rows of width samples.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1339

    
1340
/**
 * 2x2 interpolation with weights 4/3/3/2 (upper left, upper right, lower
 * left, lower right; scaled by 2731/32768, roughly 1/12), then averaged with
 * the value already in dst, rounding up.
 * Reads width+1 samples from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1350

    
1351
/**
 * 2x2 interpolation with weights 3/2/4/3 (upper left, upper right, lower
 * left, lower right; scaled by 2731/32768, roughly 1/12), then averaged with
 * the value already in dst, rounding up.
 * Reads width+1 samples from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1361

    
1362
/**
 * Vertical 1:2 interpolation (lower sample weighted double, scaled by
 * 683/2048, roughly 1/3), then averaged with the value already in dst,
 * rounding up. Reads height+1 source rows of width samples.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1372

    
1373
/**
 * 2x2 interpolation with weights 3/4/2/3 (upper left, upper right, lower
 * left, lower right; scaled by 2731/32768, roughly 1/12), then averaged with
 * the value already in dst, rounding up.
 * Reads width+1 samples from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1383

    
1384
/**
 * 2x2 interpolation with weights 2/3/3/4 (upper left, upper right, lower
 * left, lower right; scaled by 2731/32768, roughly 1/12), then averaged with
 * the value already in dst, rounding up.
 * Reads width+1 samples from height+1 source rows.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1394
#if 0
/* Disabled generator for fixed-width put_tpel_pixels<width>_mcXY_c()
 * wrappers. Each wrapper forwards to the generic put_tpel_pixels_mcXY_c()
 * with the block width supplied as a constant.
 * The bodies are plain calls; a leading "void" in front of each call would
 * turn it into a malformed declaration and break the build once this block
 * is enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1415

    
1416
/**
 * Generates the 2-, 4- and 8-pixel-wide h264_chroma_mcN_c() functions.
 *
 * Each function blends a 2x2 source neighbourhood with the weights
 * A, B, C, D derived from x and y (both asserted to lie in 0..7); the four
 * weights always add up to 64. The raw weighted sum is handed to OP, which
 * is responsible for rounding, scaling and storing into dst.
 * Every row reads N+1 samples from the current source row and from the row
 * below; dst and src advance by the same stride.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride)\
    {\
        for(col=0; col<2; col++)\
        {\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride)\
    {\
        for(col=0; col<4; col++)\
        {\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride)\
    {\
        for(col=0; col<8; col++)\
        {\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}

/* b is the weighted sum with total weight 64: add 32 and shift by 6 to round
 * to nearest. op_avg additionally averages with the old destination value,
 * rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486

    
1487
/**
 * Copies h rows from src to dst, one LD16/ST16 transfer per row.
 * LD16/ST16 are defined elsewhere (presumably a 16-bit load and store --
 * confirm against their definitions). dst and src advance by their own
 * strides after each row.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        ST16(dst, LD16(src));
    }
}
1497

    
1498
/**
 * Copies h rows from src to dst, one LD32/ST32 transfer per row.
 * LD32/ST32 are defined elsewhere (presumably a 32-bit load and store --
 * confirm against their definitions). dst and src advance by their own
 * strides after each row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        ST32(dst, LD32(src));
    }
}
1508

    
1509
/**
 * Copies h rows from src to dst, two LD32/ST32 transfers per row (byte
 * offsets 0 and 4). LD32/ST32 are defined elsewhere (presumably a 32-bit
 * load and store -- confirm against their definitions). dst and src advance
 * by their own strides after each row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        for(off=0; off<8; off+=4)
        {
            ST32(dst+off, LD32(src+off));
        }
    }
}
1520

    
1521
/**
 * Copies h rows from src to dst, four LD32/ST32 transfers per row (byte
 * offsets 0, 4, 8 and 12). LD32/ST32 are defined elsewhere (presumably a
 * 32-bit load and store -- confirm against their definitions). dst and src
 * advance by their own strides after each row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        for(off=0; off<16; off+=4)
        {
            ST32(dst+off, LD32(src+off));
        }
    }
}
1534

    
1535
/**
 * Copies h rows of 17 bytes from src to dst: four LD32/ST32 transfers (byte
 * offsets 0, 4, 8 and 12) followed by a single byte at offset 16.
 * LD32/ST32 are defined elsewhere (presumably a 32-bit load and store --
 * confirm against their definitions). dst and src advance by their own
 * strides after each row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        for(off=0; off<16; off+=4)
        {
            ST32(dst+off, LD32(src+off));
        }
        dst[16]= src[16];
    }
}
1549

    
1550
/**
 * Copies h rows of 9 bytes from src to dst: two LD32/ST32 transfers (byte
 * offsets 0 and 4) followed by a single byte at offset 8.
 * LD32/ST32 are defined elsewhere (presumably a 32-bit load and store --
 * confirm against their definitions). dst and src advance by their own
 * strides after each row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride)
    {
        for(off=0; off<8; off+=4)
        {
            ST32(dst+off, LD32(src+off));
        }
        dst[8]= src[8];
    }
}
1562

    
1563

    
1564
#define QPEL_MC(r, OPNAME, RND, OP) \
1565
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1566
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1567
    int i;\
1568
    for(i=0; i<h; i++)\
1569
    {\
1570
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578
        dst+=dstStride;\
1579
        src+=srcStride;\
1580
    }\
1581
}\
1582
\
1583
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1584
    const int w=8;\
1585
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1586
    int i;\
1587
    for(i=0; i<w; i++)\
1588
    {\
1589
        const int src0= src[0*srcStride];\
1590
        const int src1= src[1*srcStride];\
1591
        const int src2= src[2*srcStride];\
1592
        const int src3= src[3*srcStride];\
1593
        const int src4= src[4*srcStride];\
1594
        const int src5= src[5*srcStride];\
1595
        const int src6= src[6*srcStride];\
1596
        const int src7= src[7*srcStride];\
1597
        const int src8= src[8*srcStride];\
1598
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1606
        dst++;\
1607
        src++;\
1608
    }\
1609
}\
1610
\
1611
/**
 * Horizontal low-pass filter over h rows of 16 output samples.
 * Each row reads the 17 samples src[0] .. src[16] and hands 16 weighted
 * sums (weights 20, -6, 3, -1 on symmetric pairs, total 32) to OP with
 * destinations dst[0] .. dst[15].  Outputs near the left and right edge
 * reuse in-range samples rather than reading outside src[0..16].
 * dst and src advance by dstStride / srcStride after every row.
 * cm is not used directly here; the op_* macros passed as OP index it.
 */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1613
    int i;\
1614
    \
1615
    for(i=0; i<h; i++)\
1616
    {\
1617
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633
        dst+=dstStride;\
1634
        src+=srcStride;\
1635
    }\
1636
}\
1637
\
1638
/**
 * Vertical low-pass filter over a 16-column, 16-row block.
 * For each of the 16 columns, 17 vertically adjacent samples
 * (src[0] .. src[16*srcStride]) are loaded and 16 weighted sums (weights
 * 20, -6, 3, -1 on symmetric pairs, total 32) are handed to OP with
 * destinations dst[k*dstStride].  Rows near the top and bottom reuse
 * samples from inside the 17-row window instead of reading beyond it.
 * cm is not used directly here; the op_* macros passed as OP index it.
 */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1640
    int i;\
1641
    const int w=16;\
1642
    for(i=0; i<w; i++)\
1643
    {\
1644
        const int src0= src[0*srcStride];\
1645
        const int src1= src[1*srcStride];\
1646
        const int src2= src[2*srcStride];\
1647
        const int src3= src[3*srcStride];\
1648
        const int src4= src[4*srcStride];\
1649
        const int src5= src[5*srcStride];\
1650
        const int src6= src[6*srcStride];\
1651
        const int src7= src[7*srcStride];\
1652
        const int src8= src[8*srcStride];\
1653
        const int src9= src[9*srcStride];\
1654
        const int src10= src[10*srcStride];\
1655
        const int src11= src[11*srcStride];\
1656
        const int src12= src[12*srcStride];\
1657
        const int src13= src[13*srcStride];\
1658
        const int src14= src[14*srcStride];\
1659
        const int src15= src[15*srcStride];\
1660
        const int src16= src[16*srcStride];\
1661
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1677
        dst++;\
1678
        src++;\
1679
    }\
1680
}\
1681
\
1682
/** 8x8 case with no filtering: forwards to OPNAME##pixels8_c with height 8. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1683
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1684
}\
1685
\
1686
/**
 * 8x8: horizontally filters src into the 8x8 scratch buffer half (always
 * with the put##RND variant), then hands src and half to
 * OPNAME##pixels8_l2, which writes dst.
 * NOTE(review): pixels8_l2 is defined elsewhere; presumably it averages
 * its two inputs - confirm.
 */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1687
    uint8_t half[64];\
1688
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1690
}\
1691
\
1692
/** 8x8: horizontal low-pass only; the OPNAME variant of the filter writes 8 rows straight to dst. */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1693
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1694
}\
1695
\
1696
/**
 * 8x8: same as the mc10 case except that the unfiltered input handed to
 * OPNAME##pixels8_l2 is src+1 (one sample to the right) instead of src.
 */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1697
    uint8_t half[64];\
1698
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1700
}\
1701
\
1702
/**
 * 8x8: copies 9 rows of src into the scratch buffer full (row pitch 16)
 * with copy_block9, filters that copy vertically into half, then hands
 * full and half to OPNAME##pixels8_l2, which writes dst.
 */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703
    uint8_t full[16*9];\
1704
    uint8_t half[64];\
1705
    copy_block9(full, src, 16, stride, 9);\
1706
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1708
}\
1709
\
1710
/**
 * 8x8: vertical low-pass only.  9 rows of src are copied into full (row
 * pitch 16) and the OPNAME variant of the vertical filter writes dst.
 */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711
    uint8_t full[16*9];\
1712
    copy_block9(full, src, 16, stride, 9);\
1713
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1714
}\
1715
\
1716
/**
 * 8x8: same as the mc01 case except that the unfiltered input handed to
 * OPNAME##pixels8_l2 is full+16, i.e. the copy shifted down by one row
 * (the copy's row pitch is 16).
 */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717
    uint8_t full[16*9];\
1718
    uint8_t half[64];\
1719
    copy_block9(full, src, 16, stride, 9);\
1720
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1722
}\
1723
/**
 * 8x8, "old" four-input variant (not static, so it has external linkage).
 * Builds three filtered planes from a 9-row copy of src: halfH (horizontal,
 * 9 rows), halfV (vertical) and halfHV (vertical filter of halfH), then
 * hands full, halfH, halfV and halfHV to OPNAME##pixels8_l4.
 * NOTE(review): pixels8_l4 is defined elsewhere; presumably it averages
 * its four inputs - confirm.
 */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724
    uint8_t full[16*9];\
1725
    uint8_t halfH[72];\
1726
    uint8_t halfV[64];\
1727
    uint8_t halfHV[64];\
1728
    copy_block9(full, src, 16, stride, 9);\
1729
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733
}\
1734
/**
 * 8x8, two-stage variant.  halfH (9 rows of 8) receives the horizontal
 * filter of a copy of src and is then overwritten in place by
 * put##RND##pixels8_l2(halfH, full).  halfHV is the vertical filter of
 * that result, and OPNAME##pixels8_l2(halfH, halfHV) writes dst.
 */\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735
    uint8_t full[16*9];\
1736
    uint8_t halfH[72];\
1737
    uint8_t halfHV[64];\
1738
    copy_block9(full, src, 16, stride, 9);\
1739
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743
}\
1744
/**
 * 8x8, "old" four-input variant (external linkage).  Like the mc11 "old"
 * case, but halfV is filtered from full+1 and full+1 (one sample to the
 * right) is the unfiltered input given to OPNAME##pixels8_l4.
 */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745
    uint8_t full[16*9];\
1746
    uint8_t halfH[72];\
1747
    uint8_t halfV[64];\
1748
    uint8_t halfHV[64];\
1749
    copy_block9(full, src, 16, stride, 9);\
1750
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754
}\
1755
/**
 * 8x8, two-stage variant.  Like the mc11 case, but the in-place
 * put##RND##pixels8_l2 step combines halfH with full+1 (one sample to the
 * right) instead of full.
 */\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756
    uint8_t full[16*9];\
1757
    uint8_t halfH[72];\
1758
    uint8_t halfHV[64];\
1759
    copy_block9(full, src, 16, stride, 9);\
1760
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764
}\
1765
/**
 * 8x8, "old" four-input variant (external linkage).  Like the mc11 "old"
 * case, but OPNAME##pixels8_l4 is given full+16 and halfH+8, i.e. both the
 * copy and the horizontally filtered plane shifted down by one row.
 */\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766
    uint8_t full[16*9];\
1767
    uint8_t halfH[72];\
1768
    uint8_t halfV[64];\
1769
    uint8_t halfHV[64];\
1770
    copy_block9(full, src, 16, stride, 9);\
1771
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775
}\
1776
/**
 * 8x8, two-stage variant.  Like the mc11 case, but the final
 * OPNAME##pixels8_l2 reads halfH+8 (halfH shifted down by one 8-sample
 * row) instead of halfH.
 */\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777
    uint8_t full[16*9];\
1778
    uint8_t halfH[72];\
1779
    uint8_t halfHV[64];\
1780
    copy_block9(full, src, 16, stride, 9);\
1781
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785
}\
1786
/**
 * 8x8, "old" four-input variant (external linkage).  halfV is filtered
 * from full+1, and OPNAME##pixels8_l4 is given full+17 (one row down and
 * one sample right in the pitch-16 copy) together with halfH+8 (one row
 * down), halfV and halfHV.
 */\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787
    uint8_t full[16*9];\
1788
    uint8_t halfH[72];\
1789
    uint8_t halfV[64];\
1790
    uint8_t halfHV[64];\
1791
    copy_block9(full, src, 16, stride, 9);\
1792
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1793
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796
}\
1797
/**
 * 8x8, two-stage variant.  Like the mc11 case, but the in-place step
 * combines halfH with full+1 and the final OPNAME##pixels8_l2 reads
 * halfH+8 (one row down) instead of halfH.
 */\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798
    uint8_t full[16*9];\
1799
    uint8_t halfH[72];\
1800
    uint8_t halfHV[64];\
1801
    copy_block9(full, src, 16, stride, 9);\
1802
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806
}\
1807
/**
 * 8x8: filters 9 rows of src horizontally into halfH (no intermediate
 * copy), filters halfH vertically into halfHV, then hands halfH and
 * halfHV to OPNAME##pixels8_l2, which writes dst.
 */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1808
    uint8_t halfH[72];\
1809
    uint8_t halfHV[64];\
1810
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1813
}\
1814
/**
 * 8x8: same as the mc21 case except that the final OPNAME##pixels8_l2
 * reads halfH+8 (halfH shifted down by one row) instead of halfH.
 */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1815
    uint8_t halfH[72];\
1816
    uint8_t halfHV[64];\
1817
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820
}\
1821
/**
 * 8x8, "old" variant (external linkage).  Computes halfH (horizontal,
 * 9 rows), halfV (vertical filter of the copy) and halfHV (vertical
 * filter of halfH), then hands halfV and halfHV to OPNAME##pixels8_l2.
 */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822
    uint8_t full[16*9];\
1823
    uint8_t halfH[72];\
1824
    uint8_t halfV[64];\
1825
    uint8_t halfHV[64];\
1826
    copy_block9(full, src, 16, stride, 9);\
1827
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1831
}\
1832
/**
 * 8x8, two-stage variant.  halfH (9 rows) receives the horizontal filter
 * of a copy of src and is overwritten in place by
 * put##RND##pixels8_l2(halfH, full); the OPNAME variant of the vertical
 * filter then writes dst directly from halfH.
 */\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833
    uint8_t full[16*9];\
1834
    uint8_t halfH[72];\
1835
    copy_block9(full, src, 16, stride, 9);\
1836
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839
}\
1840
/**
 * 8x8, "old" variant (external linkage).  Like the mc12 "old" case, but
 * halfV is filtered from full+1 (one sample to the right).
 */\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841
    uint8_t full[16*9];\
1842
    uint8_t halfH[72];\
1843
    uint8_t halfV[64];\
1844
    uint8_t halfHV[64];\
1845
    copy_block9(full, src, 16, stride, 9);\
1846
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1850
}\
1851
/**
 * 8x8, two-stage variant.  Like the mc12 case, but the in-place
 * put##RND##pixels8_l2 step combines halfH with full+1 instead of full.
 */\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852
    uint8_t full[16*9];\
1853
    uint8_t halfH[72];\
1854
    copy_block9(full, src, 16, stride, 9);\
1855
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858
}\
1859
/**
 * 8x8: horizontal filter of 9 rows of src into halfH, followed by the
 * OPNAME variant of the vertical filter writing dst from halfH.
 */\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1860
    uint8_t halfH[72];\
1861
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863
}\
1864
/** 16x16 case with no filtering: forwards to OPNAME##pixels16_c with height 16. */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1865
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1866
}\
1867
\
1868
/**
 * 16x16: horizontally filters src into the 16x16 scratch buffer half
 * (always with the put##RND variant), then hands src and half to
 * OPNAME##pixels16_l2, which writes dst.
 * NOTE(review): pixels16_l2 is defined elsewhere; presumably it averages
 * its two inputs - confirm.
 */\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1869
    uint8_t half[256];\
1870
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872
}\
1873
\
1874
/** 16x16: horizontal low-pass only; the OPNAME variant of the filter writes 16 rows straight to dst. */\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1875
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1876
}\
1877
\
1878
/**
 * 16x16: same as the mc10 case except that the unfiltered input handed to
 * OPNAME##pixels16_l2 is src+1 (one sample to the right) instead of src.
 */\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1879
    uint8_t half[256];\
1880
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882
}\
1883
\
1884
/**
 * 16x16: copies 17 rows of src into the scratch buffer full (row pitch 24)
 * with copy_block17, filters that copy vertically into half, then hands
 * full and half to OPNAME##pixels16_l2, which writes dst.
 */\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885
    uint8_t full[24*17];\
1886
    uint8_t half[256];\
1887
    copy_block17(full, src, 24, stride, 17);\
1888
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890
}\
1891
\
1892
/**
 * 16x16: vertical low-pass only.  17 rows of src are copied into full (row
 * pitch 24) and the OPNAME variant of the vertical filter writes dst.
 */\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893
    uint8_t full[24*17];\
1894
    copy_block17(full, src, 24, stride, 17);\
1895
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1896
}\
1897
\
1898
/**
 * 16x16: same as the mc01 case except that the unfiltered input handed to
 * OPNAME##pixels16_l2 is full+24, i.e. the copy shifted down by one row
 * (the copy's row pitch is 24).
 */\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899
    uint8_t full[24*17];\
1900
    uint8_t half[256];\
1901
    copy_block17(full, src, 24, stride, 17);\
1902
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904
}\
1905
/**
 * 16x16, "old" four-input variant (not static, so it has external
 * linkage).  Builds three filtered planes from a 17-row copy of src:
 * halfH (horizontal, 17 rows), halfV (vertical) and halfHV (vertical
 * filter of halfH), then hands full, halfH, halfV and halfHV to
 * OPNAME##pixels16_l4.
 * NOTE(review): pixels16_l4 is defined elsewhere; presumably it averages
 * its four inputs - confirm.
 */\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906
    uint8_t full[24*17];\
1907
    uint8_t halfH[272];\
1908
    uint8_t halfV[256];\
1909
    uint8_t halfHV[256];\
1910
    copy_block17(full, src, 24, stride, 17);\
1911
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915
}\
1916
/**
 * 16x16, two-stage variant.  halfH (17 rows of 16) receives the horizontal
 * filter of a copy of src and is then overwritten in place by
 * put##RND##pixels16_l2(halfH, full).  halfHV is the vertical filter of
 * that result, and OPNAME##pixels16_l2(halfH, halfHV) writes dst.
 */\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917
    uint8_t full[24*17];\
1918
    uint8_t halfH[272];\
1919
    uint8_t halfHV[256];\
1920
    copy_block17(full, src, 24, stride, 17);\
1921
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925
}\
1926
/**
 * 16x16, "old" four-input variant (external linkage).  Like the mc11
 * "old" case, but halfV is filtered from full+1 and full+1 (one sample to
 * the right) is the unfiltered input given to OPNAME##pixels16_l4.
 */\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927
    uint8_t full[24*17];\
1928
    uint8_t halfH[272];\
1929
    uint8_t halfV[256];\
1930
    uint8_t halfHV[256];\
1931
    copy_block17(full, src, 24, stride, 17);\
1932
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936
}\
1937
/**
 * 16x16, two-stage variant.  Like the mc11 case, but the in-place
 * put##RND##pixels16_l2 step combines halfH with full+1 (one sample to the
 * right) instead of full.
 */\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938
    uint8_t full[24*17];\
1939
    uint8_t halfH[272];\
1940
    uint8_t halfHV[256];\
1941
    copy_block17(full, src, 24, stride, 17);\
1942
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946
}\
1947
/**
 * 16x16, "old" four-input variant (external linkage).  Like the mc11
 * "old" case, but OPNAME##pixels16_l4 is given full+24 and halfH+16, i.e.
 * both the copy and the horizontally filtered plane shifted down one row.
 */\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948
    uint8_t full[24*17];\
1949
    uint8_t halfH[272];\
1950
    uint8_t halfV[256];\
1951
    uint8_t halfHV[256];\
1952
    copy_block17(full, src, 24, stride, 17);\
1953
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957
}\
1958
/**
 * 16x16, two-stage variant.  Like the mc11 case, but the final
 * OPNAME##pixels16_l2 reads halfH+16 (halfH shifted down by one 16-sample
 * row) instead of halfH.
 */\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959
    uint8_t full[24*17];\
1960
    uint8_t halfH[272];\
1961
    uint8_t halfHV[256];\
1962
    copy_block17(full, src, 24, stride, 17);\
1963
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967
}\
1968
/**
 * 16x16, "old" four-input variant (external linkage).  halfV is filtered
 * from full+1, and OPNAME##pixels16_l4 is given full+25 (one row down and
 * one sample right in the pitch-24 copy) together with halfH+16 (one row
 * down), halfV and halfHV.
 */\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969
    uint8_t full[24*17];\
1970
    uint8_t halfH[272];\
1971
    uint8_t halfV[256];\
1972
    uint8_t halfHV[256];\
1973
    copy_block17(full, src, 24, stride, 17);\
1974
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1975
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978
}\
1979
/**
 * 16x16, two-stage variant.  Like the mc11 case, but the in-place step
 * combines halfH with full+1 and the final OPNAME##pixels16_l2 reads
 * halfH+16 (one row down) instead of halfH.
 */\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980
    uint8_t full[24*17];\
1981
    uint8_t halfH[272];\
1982
    uint8_t halfHV[256];\
1983
    copy_block17(full, src, 24, stride, 17);\
1984
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988
}\
1989
/**
 * 16x16: filters 17 rows of src horizontally into halfH (no intermediate
 * copy), filters halfH vertically into halfHV, then hands halfH and
 * halfHV to OPNAME##pixels16_l2, which writes dst.
 */\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990
    uint8_t halfH[272];\
1991
    uint8_t halfHV[256];\
1992
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995
}\
1996
/**
 * 16x16: same as the mc21 case except that the final OPNAME##pixels16_l2
 * reads halfH+16 (halfH shifted down by one row) instead of halfH.
 */\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997
    uint8_t halfH[272];\
1998
    uint8_t halfHV[256];\
1999
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002
}\
2003
/**
 * 16x16, "old" variant (external linkage).  Computes halfH (horizontal,
 * 17 rows), halfV (vertical filter of the copy) and halfHV (vertical
 * filter of halfH), then hands halfV and halfHV to OPNAME##pixels16_l2.
 */\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004
    uint8_t full[24*17];\
2005
    uint8_t halfH[272];\
2006
    uint8_t halfV[256];\
2007
    uint8_t halfHV[256];\
2008
    copy_block17(full, src, 24, stride, 17);\
2009
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013
}\
2014
/**
 * 16x16, two-stage variant.  halfH (17 rows) receives the horizontal
 * filter of a copy of src and is overwritten in place by
 * put##RND##pixels16_l2(halfH, full); the OPNAME variant of the vertical
 * filter then writes dst directly from halfH.
 */\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015
    uint8_t full[24*17];\
2016
    uint8_t halfH[272];\
2017
    copy_block17(full, src, 24, stride, 17);\
2018
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021
}\
2022
/**
 * 16x16, "old" variant (external linkage).  Like the mc12 "old" case, but
 * halfV is filtered from full+1 (one sample to the right).
 */\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023
    uint8_t full[24*17];\
2024
    uint8_t halfH[272];\
2025
    uint8_t halfV[256];\
2026
    uint8_t halfHV[256];\
2027
    copy_block17(full, src, 24, stride, 17);\
2028
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032
}\
2033
/**
 * 16x16, two-stage variant.  Like the mc12 case, but the in-place
 * put##RND##pixels16_l2 step combines halfH with full+1 instead of full.
 */\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034
    uint8_t full[24*17];\
2035
    uint8_t halfH[272];\
2036
    copy_block17(full, src, 24, stride, 17);\
2037
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040
}\
2041
/**
 * 16x16: horizontal filter of 17 rows of src into halfH, followed by the
 * OPNAME variant of the vertical filter writing dst from halfH.
 * This is the last function of the enclosing macro body (no trailing
 * line continuation after the closing brace).
 */\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042
    uint8_t halfH[272];\
2043
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045
}
2046

    
2047
/*
 * Store operators passed to QPEL_MC as its last argument.
 *   a - destination lvalue
 *   b - raw filter sum (the mpeg4 qpel filter weights total 32)
 * Each operator adds a bias to b (16 for the rounding forms, 15 for the
 * no_rnd forms), shifts right by 5 and uses the result as an index into
 * cm, which must be in scope at the point of expansion.  The put forms
 * store that table entry; the avg forms average it with the value already
 * in a (op_avg adds 1 before halving, op_avg_no_rnd does not).
 * NOTE(review): the expansions are bare assignments and the avg forms
 * evaluate a twice, so arguments must be free of side effects.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049
#define op_put(a, b) a = cm[((b) + 16)>>5]
2050
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2051

    
2052
/*
 * Instantiate the QPEL_MC function family three times: the put_ set with
 * op_put, the put_no_rnd_ set with op_put_no_rnd, and the avg_ set with
 * op_avg.  The avg_no_rnd instantiation below is commented out, so
 * op_avg_no_rnd is not consumed by any of these invocations.
 * The four op_* helper macros are undefined again right after the
 * instantiations.
 */
QPEL_MC(0, put_       , _       , op_put)
2053
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054
QPEL_MC(0, avg_       , _       , op_avg)
2055
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2056
#undef op_avg
2057
#undef op_avg_no_rnd
2058
#undef op_put
2059
#undef op_put_no_rnd
2060

    
2061
#if 1
2062
#define H264_LOWPASS(OPNAME, OP, OP2) \
2063
/**
 * Horizontal six-tap filter for a block 2 samples wide and 2 rows high.
 * Each output x combines src[x-2] .. src[x+3] with the weights
 * 1, -5, 20, 20, -5, 1 (total 32) and hands the raw sum to OP, so every
 * row reads src[-2] .. src[4].
 * OP is a parameter of H264_LOWPASS; cm is declared for use by OP.
 */\
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2064
    const int h=2;\
2065
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2066
    int i;\
2067
    for(i=0; i<h; i++)\
2068
    {\
2069
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2070
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2071
        dst+=dstStride;\
2072
        src+=srcStride;\
2073
    }\
2074
}\
2075
\
2076
/**
 * Vertical six-tap filter for a block 2 columns wide and 2 rows high.
 * For each column, the seven samples from two rows above src to four rows
 * below (srcB, srcA, src0 .. src4) are loaded; each output row combines
 * six of them with the weights 1, -5, 20, 20, -5, 1 (total 32) and hands
 * the raw sum to OP.
 */\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2077
    const int w=2;\
2078
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2079
    int i;\
2080
    for(i=0; i<w; i++)\
2081
    {\
2082
        const int srcB= src[-2*srcStride];\
2083
        const int srcA= src[-1*srcStride];\
2084
        const int src0= src[0 *srcStride];\
2085
        const int src1= src[1 *srcStride];\
2086
        const int src2= src[2 *srcStride];\
2087
        const int src3= src[3 *srcStride];\
2088
        const int src4= src[4 *srcStride];\
2089
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2090
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2091
        dst++;\
2092
        src++;\
2093
    }\
2094
}\
2095
\
2096
/**
 * Two-pass (horizontal then vertical) six-tap filter for a 2x2 block.
 * Pass 1 starts two rows above src and writes unscaled horizontal sums for
 * h+5 = 7 rows into the int16_t scratch plane tmp.  tmp is then rewound by
 * h+5-2 rows so that it points at the scratch row matching the original
 * src row.  Pass 2 applies the same weights vertically to the tmp columns
 * and hands each raw sum to OP2 (a parameter of H264_LOWPASS).
 */\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2097
    const int h=2;\
2098
    const int w=2;\
2099
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2100
    int i;\
2101
    src -= 2*srcStride;\
2102
    for(i=0; i<h+5; i++)\
2103
    {\
2104
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2105
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2106
        tmp+=tmpStride;\
2107
        src+=srcStride;\
2108
    }\
2109
    tmp -= tmpStride*(h+5-2);\
2110
    for(i=0; i<w; i++)\
2111
    {\
2112
        const int tmpB= tmp[-2*tmpStride];\
2113
        const int tmpA= tmp[-1*tmpStride];\
2114
        const int tmp0= tmp[0 *tmpStride];\
2115
        const int tmp1= tmp[1 *tmpStride];\
2116
        const int tmp2= tmp[2 *tmpStride];\
2117
        const int tmp3= tmp[3 *tmpStride];\
2118
        const int tmp4= tmp[4 *tmpStride];\
2119
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2120
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2121
        dst++;\
2122
        tmp++;\
2123
    }\
2124
}\
2125
/**
 * Horizontal six-tap filter for a block 4 samples wide and 4 rows high.
 * Each output x combines src[x-2] .. src[x+3] with the weights
 * 1, -5, 20, 20, -5, 1 (total 32) and hands the raw sum to OP, so every
 * row reads src[-2] .. src[6].
 */\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2126
    const int h=4;\
2127
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2128
    int i;\
2129
    for(i=0; i<h; i++)\
2130
    {\
2131
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2132
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2133
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2134
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2135
        dst+=dstStride;\
2136
        src+=srcStride;\
2137
    }\
2138
}\
2139
\
2140
/**
 * Vertical six-tap filter for a block 4 columns wide and 4 rows high.
 * For each column, the nine samples from two rows above src to six rows
 * below (srcB, srcA, src0 .. src6) are loaded; each output row combines
 * six of them with the weights 1, -5, 20, 20, -5, 1 (total 32) and hands
 * the raw sum to OP.
 */\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2141
    const int w=4;\
2142
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2143
    int i;\
2144
    for(i=0; i<w; i++)\
2145
    {\
2146
        const int srcB= src[-2*srcStride];\
2147
        const int srcA= src[-1*srcStride];\
2148
        const int src0= src[0 *srcStride];\
2149
        const int src1= src[1 *srcStride];\
2150
        const int src2= src[2 *srcStride];\
2151
        const int src3= src[3 *srcStride];\
2152
        const int src4= src[4 *srcStride];\
2153
        const int src5= src[5 *srcStride];\
2154
        const int src6= src[6 *srcStride];\
2155
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2156
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2157
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2158
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2159
        dst++;\
2160
        src++;\
2161
    }\
2162
}\
2163
\
2164
/**
 * Two-pass (horizontal then vertical) six-tap filter for a 4x4 block.
 * Pass 1 starts two rows above src and writes unscaled horizontal sums for
 * h+5 = 9 rows into the int16_t scratch plane tmp.  tmp is then rewound by
 * h+5-2 rows so that it points at the scratch row matching the original
 * src row.  Pass 2 applies the same weights vertically to the tmp columns
 * and hands each raw sum to OP2 (a parameter of H264_LOWPASS).
 */\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2165
    const int h=4;\
2166
    const int w=4;\
2167
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2168
    int i;\
2169
    src -= 2*srcStride;\
2170
    for(i=0; i<h+5; i++)\
2171
    {\
2172
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2173
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2174
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2175
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2176
        tmp+=tmpStride;\
2177
        src+=srcStride;\
2178
    }\
2179
    tmp -= tmpStride*(h+5-2);\
2180
    for(i=0; i<w; i++)\
2181
    {\
2182
        const int tmpB= tmp[-2*tmpStride];\
2183
        const int tmpA= tmp[-1*tmpStride];\
2184
        const int tmp0= tmp[0 *tmpStride];\
2185
        const int tmp1= tmp[1 *tmpStride];\
2186
        const int tmp2= tmp[2 *tmpStride];\
2187
        const int tmp3= tmp[3 *tmpStride];\
2188
        const int tmp4= tmp[4 *tmpStride];\
2189
        const int tmp5= tmp[5 *tmpStride];\
2190
        const int tmp6= tmp[6 *tmpStride];\
2191
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2192
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2193
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2194
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2195
        dst++;\
2196
        tmp++;\
2197
    }\
2198
}\
2199
\
2200
/**
 * Horizontal six-tap filter for a block 8 samples wide and 8 rows high.
 * Each output x combines src[x-2] .. src[x+3] with the weights
 * 1, -5, 20, 20, -5, 1 (total 32) and hands the raw sum to OP, so every
 * row reads src[-2] .. src[10].
 */\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2201
    const int h=8;\
2202
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2203
    int i;\
2204
    for(i=0; i<h; i++)\
2205
    {\
2206
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2207
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2208
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2209
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2210
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2211
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2212
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2213
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2214
        dst+=dstStride;\
2215
        src+=srcStride;\
2216
    }\
2217
}\
2218
\
2219
/**
 * Vertical six-tap filter for a block 8 columns wide and 8 rows high.
 * For each column, the 13 samples from two rows above src to ten rows
 * below (srcB, srcA, src0 .. src10) are loaded; each output row combines
 * six of them with the weights 1, -5, 20, 20, -5, 1 (total 32) and hands
 * the raw sum to OP.
 */\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220
    const int w=8;\
2221
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2222
    int i;\
2223
    for(i=0; i<w; i++)\
2224
    {\
2225
        const int srcB= src[-2*srcStride];\
2226
        const int srcA= src[-1*srcStride];\
2227
        const int src0= src[0 *srcStride];\
2228
        const int src1= src[1 *srcStride];\
2229
        const int src2= src[2 *srcStride];\
2230
        const int src3= src[3 *srcStride];\
2231
        const int src4= src[4 *srcStride];\
2232
        const int src5= src[5 *srcStride];\
2233
        const int src6= src[6 *srcStride];\
2234
        const int src7= src[7 *srcStride];\
2235
        const int src8= src[8 *srcStride];\
2236
        const int src9= src[9 *srcStride];\
2237
        const int src10=src[10*srcStride];\
2238
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2239
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2240
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2241
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2243
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2244
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2245
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2246
        dst++;\
2247
        src++;\
2248
    }\
2249
}\
2250
\
2251
/* Centre (half/half) sample filter for one 8x8 block, done in two passes.  */\
/* Pass 1 runs the horizontal 6-tap filter over 13 rows (2 above, 3 below   */\
/* the block) and keeps the unscaled sums in tmp. Pass 2 runs the same      */\
/* filter vertically over tmp and hands the result to OP2, whose variants   */\
/* in this file round with +512, shift by 10 and clip through cm[].         */\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* referenced from inside OP2 */\
    int i;\
    src -= 2*srcStride; /* start two rows above the block */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to the third stored row, i.e. the one matching block row 0 */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        /* one column per pass over the intermediate rows -2..10 */\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
2298
\
2299
/* 16x16 vertical half-sample filter, built from four 8x8 quadrant calls:   */\
/* top-left, top-right, then the same pair eight rows further down.         */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
2307
\
2308
/* 16x16 horizontal half-sample filter, built from four 8x8 quadrant calls: */\
/* top-left, top-right, then the same pair eight rows further down.         */\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
2316
\
2317
/* 16x16 centre-sample filter, built from four 8x8 quadrant calls. The tmp  */\
/* pointer is not advanced between the upper and lower pair, so the lower   */\
/* quadrants reuse the scratch rows the upper ones have finished with.      */\
/* This is the last function of the macro, so its closing brace carries no  */\
/* continuation backslash.                                                  */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
2325

    
2326
/*
 * H264_MC(OPNAME, SIZE) generates the sixteen quarter-sample motion
 * compensation functions OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZE x SIZE
 * block. X is the horizontal and Y the vertical offset in quarter samples:
 *   - mc00 is a plain copy/average of the source block;
 *   - mc20 / mc02 / mc22 are the horizontal, vertical and centre half-sample
 *     positions, written straight through the OPNAME lowpass filters;
 *   - every other position builds two intermediate SIZE x SIZE planes with
 *     the put_ lowpass filters (or takes the source itself as one of them)
 *     and merges them with OPNAME##pixels##SIZE##_l2.
 * Vertical filtering works on `full`, a packed copy (stride SIZE) of SIZE+5
 * source rows starting two rows above the block; `full_mid` points at the
 * row matching block row 0.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2462

    
2463
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2464
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2465
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2466
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2467
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2468

    
2469
H264_LOWPASS(put_       , op_put, op2_put)
2470
H264_LOWPASS(avg_       , op_avg, op2_avg)
2471
H264_MC(put_, 2)
2472
H264_MC(put_, 4)
2473
H264_MC(put_, 8)
2474
H264_MC(put_, 16)
2475
H264_MC(avg_, 4)
2476
H264_MC(avg_, 8)
2477
H264_MC(avg_, 16)
2478

    
2479
#undef op_avg
2480
#undef op_put
2481
#undef op2_avg
2482
#undef op2_put
2483
#endif
2484

    
2485
/*
 * Weighted prediction for W x H blocks.
 *
 * weight_h264_pixelsWxH_c scales `block` in place:
 *     block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 * where offset' is offset scaled by 2^log2_denom plus, when log2_denom is
 * non-zero, the rounding term 2^(log2_denom-1).
 *
 * biweight_h264_pixelsWxH_c blends `src` into `dst`:
 *     dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                         >> (log2_denom+1))
 * where offset' is ((offset + 1) | 1) scaled by 2^log2_denom.
 *
 * The offset is scaled by multiplication rather than `<<` because it may be
 * negative, and left-shifting a negative int is undefined behaviour in C.
 * W selects how many of the unrolled stores run per row; the `if(W==n)`
 * tests are compile-time constants after expansion.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554

    
2555
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557
    int i;
2558

    
2559
    for(i=0; i<h; i++){
2560
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568
        dst+=dstStride;
2569
        src+=srcStride;
2570
    }
2571
}
2572

    
2573
/* AVS specific */
2574
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2575

    
2576
/* Full-sample (mc00) CAVS positions: these simply forward to the generic
 * 8x8 / 16x16 pixel copy and average routines. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2588

    
2589
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2590
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
2591
    int i;
2592

    
2593
    for(i=0; i<w; i++){
2594
        const int src_1= src[ -srcStride];
2595
        const int src0 = src[0          ];
2596
        const int src1 = src[  srcStride];
2597
        const int src2 = src[2*srcStride];
2598
        const int src3 = src[3*srcStride];
2599
        const int src4 = src[4*srcStride];
2600
        const int src5 = src[5*srcStride];
2601
        const int src6 = src[6*srcStride];
2602
        const int src7 = src[7*srcStride];
2603
        const int src8 = src[8*srcStride];
2604
        const int src9 = src[9*srcStride];
2605
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2606
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2607
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2608
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2609
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2610
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2611
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2612
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2613
        src++;
2614
        dst++;
2615
    }
2616
}
2617

    
2618
/* mspel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2621

    
2622
/* mspel position (1,0): the source block merged with its horizontal
 * half-sample plane via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2627

    
2628
/* mspel position (2,0): horizontal half-sample plane written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2631

    
2632
/* mspel position (3,0): the source block shifted one pixel right, merged
 * with the horizontal half-sample plane via put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2637

    
2638
/* mspel position (0,2): vertical half-sample plane written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2641

    
2642
/* mspel position (1,2): merges the vertical half-sample plane of the source
 * with the horizontal-then-vertical plane. halfH holds 11 horizontally
 * filtered rows starting one row above the block (8 bytes per row), so
 * halfH+8 is the row matching block row 0. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2651
/* mspel position (3,2): like mc12, but the vertical half-sample plane is
 * taken from the source shifted one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
2660
/* mspel position (2,2): horizontal filter over 11 rows (starting one row
 * above the block) into halfH, then the vertical filter over that plane
 * straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2665

    
2666
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2667
    int x;
2668
    const int strength= ff_h263_loop_filter_strength[qscale];
2669

    
2670
    for(x=0; x<8; x++){
2671
        int d1, d2, ad1;
2672
        int p0= src[x-2*stride];
2673
        int p1= src[x-1*stride];
2674
        int p2= src[x+0*stride];
2675
        int p3= src[x+1*stride];
2676
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2677

    
2678
        if     (d<-2*strength) d1= 0;
2679
        else if(d<-  strength) d1=-2*strength - d;
2680
        else if(d<   strength) d1= d;
2681
        else if(d< 2*strength) d1= 2*strength - d;
2682
        else                   d1= 0;
2683

    
2684
        p1 += d1;
2685
        p2 -= d1;
2686
        if(p1&256) p1= ~(p1>>31);
2687
        if(p2&256) p2= ~(p2>>31);
2688

    
2689
        src[x-1*stride] = p1;
2690
        src[x+0*stride] = p2;
2691

    
2692
        ad1= ABS(d1)>>1;
2693

    
2694
        d2= clip((p0-p3)/4, -ad1, ad1);
2695

    
2696
        src[x-2*stride] = p0 - d2;
2697
        src[x+  stride] = p3 + d2;
2698
    }
2699
}
2700

    
2701
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2702
    int y;
2703
    const int strength= ff_h263_loop_filter_strength[qscale];
2704

    
2705
    for(y=0; y<8; y++){
2706
        int d1, d2, ad1;
2707
        int p0= src[y*stride-2];
2708
        int p1= src[y*stride-1];
2709
        int p2= src[y*stride+0];
2710
        int p3= src[y*stride+1];
2711
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2712

    
2713
        if     (d<-2*strength) d1= 0;
2714
        else if(d<-  strength) d1=-2*strength - d;
2715
        else if(d<   strength) d1= d;
2716
        else if(d< 2*strength) d1= 2*strength - d;
2717
        else                   d1= 0;
2718

    
2719
        p1 += d1;
2720
        p2 -= d1;
2721
        if(p1&256) p1= ~(p1>>31);
2722
        if(p2&256) p2= ~(p2>>31);
2723

    
2724
        src[y*stride-1] = p1;
2725
        src[y*stride+0] = p2;
2726

    
2727
        ad1= ABS(d1)>>1;
2728

    
2729
        d2= clip((p0-p3)/4, -ad1, ad1);
2730

    
2731
        src[y*stride-2] = p0 - d2;
2732
        src[y*stride+1] = p3 + d2;
2733
    }
2734
}
2735

    
2736
/**
 * Separable (1,2,1)/4 smoothing filter over one 8x8 block, in place.
 *
 * The vertical pass stores sums scaled by 4 in temp; the first and last row
 * are not filtered vertically and are just multiplied by 4. The horizontal
 * pass then filters the interior columns (total scale 16, rounded), while
 * the first and last column only undo the vertical scale.
 *
 * @param src    top-left pixel of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* rows 0 and 7: no vertical neighbours used, keep the x4 scale */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* rows 1..6: vertical (1,2,1) */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* columns 0 and 7: only remove the vertical scale */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        /* columns 1..6: horizontal (1,2,1) on the scaled values */
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2762

    
2763
/**
 * H.264 luma deblocking of one 16-sample edge (normal, tc-limited filter).
 *
 * The edge is handled as 4 groups of 4 samples; tc0[i] is the clipping bound
 * for group i and a negative value skips that group. For each sample the
 * pixels p2,p1,p0 | q0,q1,q2 are read across the edge. If the alpha/beta
 * activity tests pass, p1 and q1 are optionally corrected (each such
 * correction widens the bound used for p0/q0 by one) and p0/q0 are moved
 * towards each other by a clipped delta.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between samples along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on the inner differences on each side
 * @param tc0     four clipping bounds, one per group of 4 samples
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* (q0 - p0) may be negative: multiply instead of shifting left */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Edge between two rows: step across it by stride, along it by 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Edge between two columns: step across it by 1, along it by stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2811

    
2812
/**
 * H.264 chroma deblocking of one 8-sample edge (normal, tc-limited filter).
 *
 * The edge is handled as 4 groups of 2 samples; tc0[i] is the clipping bound
 * for group i and a value <= 0 skips that group. Only p0 and q0 are changed:
 * when the alpha/beta activity tests pass they are moved towards each other
 * by a delta clipped to [-tc, tc].
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between samples along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping bounds, one per group of 2 samples
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* (q0 - p0) may be negative: multiply instead of shifting left */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Edge between two rows: step across it by stride, along it by 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Edge between two columns: step across it by 1, along it by stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2848

    
2849
/**
 * H.264 chroma deblocking of one 8-sample edge, intra (unclipped) variant.
 *
 * For each sample where the alpha/beta activity tests pass, p0 and q0 are
 * replaced by fixed weighted averages of their neighbours:
 *     p0' = (2*p1 + p0 + q1 + 2) >> 2,   q0' = (2*q1 + q0 + p1 + 2) >> 2.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between samples along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Edge between two rows: step across it by stride, along it by 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Edge between two columns: step across it by 1, along it by stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2876

    
2877
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return sum over h rows of |pix1[x] - pix2[x]| for x = 0..15
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2904

    
2905
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * interpolated horizontally: each reference sample is avg2() of a pixel and
 * its right neighbour, so pix2[0..16] is read on every row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2932

    
2933
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * interpolated vertically: each reference sample is avg2() of a pixel and
 * the one directly below it, so h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* the row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2962

    
2963
/**
 * Sum of absolute differences between a 16-pixel-wide block and a reference
 * interpolated diagonally: each reference sample is avg4() of a 2x2
 * neighbourhood, so 17 pixels on each of h+1 reference rows are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return the accumulated absolute differences
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i, x;
    uint8_t *pix3 = pix2 + line_size; /* the row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2992

    
2993
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two consecutive rows
 * @param h         number of rows to process
 * @return sum over all rows of |pix1[x] - pix2[x]| for x in 0..7
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3012

    
3013
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * with avg2() of horizontally adjacent samples of pix2.
 *
 * Sample x of pix1 is compared with avg2(pix2[x], pix2[x+1]), so 9 columns
 * of pix2 are read per row.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block (source of the horizontal averages)
 * @param line_size distance in bytes between two consecutive rows
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3032

    
3033
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * with avg2() of vertically adjacent samples of pix2.
 *
 * Sample x of pix1 is compared with avg2(pix2[x], pix2[x+line_size]), so
 * h+1 rows of pix2 are read.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block (source of the vertical averages)
 * @param line_size distance in bytes between two consecutive rows
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size;   /* row directly below pix2 */

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3054

    
3055
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * with avg4() of each 2x2 neighbourhood of pix2.
 *
 * Sample x of pix1 is compared with
 * avg4(pix2[x], pix2[x+1], pix2[x+line_size], pix2[x+1+line_size]),
 * so 9 columns and h+1 rows of pix2 are read.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block (source of the 2x2 averages)
 * @param line_size distance in bytes between two consecutive rows
 * @param h         number of rows to process
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size;   /* row directly below pix2 */

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3076

    
3077
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3078
    MpegEncContext *c = v;
3079
    int score1=0;
3080
    int score2=0;
3081
    int x,y;
3082

    
3083
    for(y=0; y<h; y++){
3084
        for(x=0; x<16; x++){
3085
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3086
        }
3087
        if(y+1<h){
3088
            for(x=0; x<15; x++){
3089
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3090
                             - s1[x+1] + s1[x+1+stride])
3091
                        -ABS(  s2[x  ] - s2[x  +stride]
3092
                             - s2[x+1] + s2[x+1+stride]);
3093
            }
3094
        }
3095
        s1+= stride;
3096
        s2+= stride;
3097
    }
3098

    
3099
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3100
    else  return score1 + ABS(score2)*8;
3101
}
3102

    
3103
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3104
    MpegEncContext *c = v;
3105
    int score1=0;
3106
    int score2=0;
3107
    int x,y;
3108

    
3109
    for(y=0; y<h; y++){
3110
        for(x=0; x<8; x++){
3111
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3112
        }
3113
        if(y+1<h){
3114
            for(x=0; x<7; x++){
3115
                score2+= ABS(  s1[x  ] - s1[x  +stride]
3116
                             - s1[x+1] + s1[x+1+stride])
3117
                        -ABS(  s2[x  ] - s2[x  +stride]
3118
                             - s2[x+1] + s2[x+1+stride]);
3119
            }
3120
        }
3121
        s1+= stride;
3122
        s2+= stride;
3123
    }
3124

    
3125
    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3126
    else  return score1 + ABS(score2)*8;
3127
}
3128

    
3129
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3130
    int i;
3131
    unsigned int sum=0;
3132

    
3133
    for(i=0; i<8*8; i++){
3134
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3135
        int w= weight[i];
3136
        b>>= RECON_SHIFT;
3137
        assert(-512<b && b<512);
3138

    
3139
        sum += (w*b)*(w*b)>>4;
3140
    }
3141
    return sum>>2;
3142
}
3143

    
3144
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3145
    int i;
3146

    
3147
    for(i=0; i<8*8; i++){
3148
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3149
    }
3150
}
3151

    
3152
/**
3153
 * permutes an 8x8 block.
3154
 * @param block the block which will be permuted according to the given permutation vector
3155
 * @param permutation the permutation vector
3156
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3157
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3158
 *                  (inverse) permutated to scantable order!
3159
 */
3160
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3161
{
3162
    int i;
3163
    DCTELEM temp[64];
3164

    
3165
    if(last<=0) return;
3166
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3167

    
3168
    for(i=0; i<=last; i++){
3169
        const int j= scantable[i];
3170
        temp[j]= block[j];
3171
        block[j]=0;
3172
    }
3173

    
3174
    for(i=0; i<=last; i++){
3175
        const int j= scantable[i];
3176
        const int perm_j= permutation[j];
3177
        block[perm_j]= temp[j];
3178
    }
3179
}
3180

    
3181
/**
 * Comparison function that ignores its arguments and always returns 0.
 * ff_set_cmp() installs it for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3184

    
3185
/**
 * Fills a table of five comparison functions from the DSPContext
 * according to the comparison type.
 *
 * @param c    context providing the per-type function tables
 * @param cmp  destination table with room for 5 entries; it is zeroed first
 * @param type comparison type, only the low 8 bits are examined
 *
 * For an unknown type an error is logged (once per table entry) and the
 * corresponding entries stay NULL.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* size the clear by the element type rather than assuming that a
       function pointer has the size of void* */
    memset(cmp, 0, sizeof(*cmp)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3244

    
3245
/**
3246
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3247
 */
3248
static void clear_blocks_c(DCTELEM *blocks)
3249
{
3250
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3251
}
3252

    
3253
/**
 * Adds src to dst byte by byte; each result is stored back into a uint8_t
 * and therefore wraps modulo 256.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3268

    
3269
/**
 * Stores src1 - src2 byte by byte into dst; each result is stored into a
 * uint8_t and therefore wraps modulo 256.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
3284

    
3285
/**
 * Writes the residual of a median prediction for one line.
 *
 * For every position the predictor is
 * mid_pred(left, top, (left + top - topleft) & 0xFF), where "top" comes
 * from src1 and "left" is the previous sample of src2; dst receives
 * src2[i] - pred.
 *
 * @param dst      residual output, w bytes
 * @param src1     line supplying the top and top-left samples
 * @param src2     line being predicted
 * @param w        number of samples to process
 * @param left     in: sample left of src2[0]; out: last sample of src2 read
 * @param left_top in: sample left of src1[0]; out: last sample of src1 read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3302

    
3303
/* Two-output butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two separate statements, so it must only be used inside a
 * braced block. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y (old values).
 * Declares the locals a and b, so the arguments must not be named a or b. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Last butterfly stage fused with the absolute value: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3317

    
3318
/**
 * Sum of absolute values of the butterfly (Hadamard) transformed 8x8
 * difference src - dst.
 *
 * The first loop transforms each row of the difference, the second loop
 * transforms each column; the last column stage is fused with the
 * absolute-value summation through BUTTERFLYA.
 *
 * @param s      not used by this function
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between two consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of the absolute transformed coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
3369

    
3370
/**
 * Sum of absolute values of the butterfly (Hadamard) transformed 8x8
 * block src, with the contribution of the mean removed.
 *
 * The first loop transforms each row, the second loop transforms each
 * column; the last column stage is fused with the absolute-value summation
 * through BUTTERFLYA. Finally |temp[0] + temp[32]| is subtracted again.
 *
 * @param s      not used by this function
 * @param src    block to analyse
 * @param dummy  not used by this function
 * @param stride distance in bytes between two consecutive rows
 * @param h      must be 8 (asserted)
 * @return the sum of the absolute transformed coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3417

    
3418
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3419
    MpegEncContext * const s= (MpegEncContext *)c;
3420
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3421
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3422
    int sum=0, i;
3423

    
3424
    assert(h==8);
3425

    
3426
    s->dsp.diff_pixels(temp, src1, src2, stride);
3427
    s->dsp.fdct(temp);
3428

    
3429
    for(i=0; i<64; i++)
3430
        sum+= ABS(temp[i]);
3431

    
3432
    return sum;
3433
}
3434

    
3435
#ifdef CONFIG_GPL
/* One 8-point integer transform pass. The caller defines SRC(x) to read
 * input x and DST(x,v) to consume output x before expanding the macro. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute coefficients obtained by applying DCT8_1D to the rows
 * and then to the columns of the 8x8 difference between src1 and src2.
 *
 * @param c      MpegEncContext; its dsp.diff_pixels is called
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between two consecutive rows
 * @param h      not used by this function
 * @return sum of |coefficient| over all 64 coefficients
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass a pointer to the first coefficient instead of a pointer to a row array */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass: transform in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: accumulate the absolute outputs instead of storing them */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3487

    
3488
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3489
    MpegEncContext * const s= (MpegEncContext *)c;
3490
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3491
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3492
    int sum=0, i;
3493

    
3494
    assert(h==8);
3495

    
3496
    s->dsp.diff_pixels(temp, src1, src2, stride);
3497
    s->dsp.fdct(temp);
3498

    
3499
    for(i=0; i<64; i++)
3500
        sum= FFMAX(sum, ABS(temp[i]));
3501

    
3502
    return sum;
3503
}
3504

    
3505
void simple_idct(DCTELEM *block); //FIXME
3506

    
3507
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3508
    MpegEncContext * const s= (MpegEncContext *)c;
3509
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3510
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3512
    int sum=0, i;
3513

    
3514
    assert(h==8);
3515
    s->mb_intra=0;
3516

    
3517
    s->dsp.diff_pixels(temp, src1, src2, stride);
3518

    
3519
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3520

    
3521
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3523
    simple_idct(temp); //FIXME
3524

    
3525
    for(i=0; i<64; i++)
3526
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3527

    
3528
    return sum;
3529
}
3530

    
3531
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3532
    MpegEncContext * const s= (MpegEncContext *)c;
3533
    const uint8_t *scantable= s->intra_scantable.permutated;
3534
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3535
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3536
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3537
    uint8_t * const bak= (uint8_t*)aligned_bak;
3538
    int i, last, run, bits, level, distoration, start_i;
3539
    const int esc_length= s->ac_esc_length;
3540
    uint8_t * length;
3541
    uint8_t * last_length;
3542

    
3543
    assert(h==8);
3544

    
3545
    for(i=0; i<8; i++){
3546
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3547
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3548
    }
3549

    
3550
    s->dsp.diff_pixels(temp, src1, src2, stride);
3551

    
3552
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3553

    
3554
    bits=0;
3555

    
3556
    if (s->mb_intra) {
3557
        start_i = 1;
3558
        length     = s->intra_ac_vlc_length;
3559
        last_length= s->intra_ac_vlc_last_length;
3560
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3561
    } else {
3562
        start_i = 0;
3563
        length     = s->inter_ac_vlc_length;
3564
        last_length= s->inter_ac_vlc_last_length;
3565
    }
3566

    
3567
    if(last>=start_i){
3568
        run=0;
3569
        for(i=start_i; i<last; i++){
3570
            int j= scantable[i];
3571
            level= temp[j];
3572

    
3573
            if(level){
3574
                level+=64;
3575
                if((level&(~127)) == 0){
3576
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3577
                }else
3578
                    bits+= esc_length;
3579
                run=0;
3580
            }else
3581
                run++;
3582
        }
3583
        i= scantable[last];
3584

    
3585
        level= temp[i] + 64;
3586

    
3587
        assert(level - 64);
3588

    
3589
        if((level&(~127)) == 0){
3590
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3591
        }else
3592
            bits+= esc_length;
3593

    
3594
    }
3595

    
3596
    if(last>=0){
3597
        if(s->mb_intra)
3598
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3599
        else
3600
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3601
    }
3602

    
3603
    s->dsp.idct_add(bak, stride, temp);
3604

    
3605
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3606

    
3607
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3608
}
3609

    
3610
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3611
    MpegEncContext * const s= (MpegEncContext *)c;
3612
    const uint8_t *scantable= s->intra_scantable.permutated;
3613
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3614
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3615
    int i, last, run, bits, level, start_i;
3616
    const int esc_length= s->ac_esc_length;
3617
    uint8_t * length;
3618
    uint8_t * last_length;
3619

    
3620
    assert(h==8);
3621

    
3622
    s->dsp.diff_pixels(temp, src1, src2, stride);
3623

    
3624
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3625

    
3626
    bits=0;
3627

    
3628
    if (s->mb_intra) {
3629
        start_i = 1;
3630
        length     = s->intra_ac_vlc_length;
3631
        last_length= s->intra_ac_vlc_last_length;
3632
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3633
    } else {
3634
        start_i = 0;
3635
        length     = s->inter_ac_vlc_length;
3636
        last_length= s->inter_ac_vlc_last_length;
3637
    }
3638

    
3639
    if(last>=start_i){
3640
        run=0;
3641
        for(i=start_i; i<last; i++){
3642
            int j= scantable[i];
3643
            level= temp[j];
3644

    
3645
            if(level){
3646
                level+=64;
3647
                if((level&(~127)) == 0){
3648
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3649
                }else
3650
                    bits+= esc_length;
3651
                run=0;
3652
            }else
3653
                run++;
3654
        }
3655
        i= scantable[last];
3656

    
3657
        level= temp[i] + 64;
3658

    
3659
        assert(level - 64);
3660

    
3661
        if((level&(~127)) == 0){
3662
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3663
        }else
3664
            bits+= esc_length;
3665
    }
3666

    
3667
    return bits;
3668
}
3669

    
3670
/**
 * Sum of absolute differences between vertically adjacent rows of a
 * 16-pixel-wide block.
 *
 * @param c      not used by this function
 * @param s      block to analyse
 * @param dummy  not used by this function
 * @param stride distance in bytes between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return sum of |s[x] - s[x+stride]| over all row pairs
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= ABS(s[x] - s[x+stride]);
        }
        s+= stride;
    }

    return score;
}
3684

    
3685
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      not used by this function
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return sum of |(s1[x]-s2[x]) - (s1[x+stride]-s2[x+stride])|
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= ABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3699

    
3700
/* Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent rows of a
 * 16-pixel-wide block.
 *
 * @param c      not used by this function
 * @param s      block to analyse
 * @param dummy  not used by this function
 * @param stride distance in bytes between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return sum of (s[x] - s[x+stride])^2 over all row pairs
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s[x] - s[x+stride]);
        }
        s+= stride;
    }

    return score;
}
3715

    
3716
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      not used by this function
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return sum of ((s1[x]-s2[x]) - (s1[x+stride]-s2[x+stride]))^2
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3730

    
3731
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3732
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3733
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3734
#ifdef CONFIG_GPL
3735
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3736
#endif
3737
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3738
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3739
WARPER8_16_SQ(rd8x8_c, rd16_c)
3740
WARPER8_16_SQ(bit8x8_c, bit16_c)
3741

    
3742
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3743
 converted */
3744
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3745
{
3746
    j_rev_dct (block);
3747
    put_pixels_clamped_c(block, dest, line_size);
3748
}
3749
/**
 * Runs j_rev_dct() on the block in place, then adds the result to dest
 * through add_pixels_clamped_c().
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3754

    
3755
/**
 * Runs j_rev_dct4() on the block in place, then stores the result to dest
 * through put_pixels_clamped4_c().
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
3760
/**
 * Runs j_rev_dct4() on the block in place, then adds the result to dest
 * through add_pixels_clamped4_c().
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3765

    
3766
/**
 * Runs j_rev_dct2() on the block in place, then stores the result to dest
 * through put_pixels_clamped2_c().
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
3771
/**
 * Runs j_rev_dct2() on the block in place, then adds the result to dest
 * through add_pixels_clamped2_c().
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3776

    
3777
/**
 * Single-sample "IDCT": writes (block[0] + 4) >> 3, clamped through the
 * crop table, to dest[0]. line_size is not used.
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
3783
/**
 * Single-sample "IDCT": adds (block[0] + 4) >> 3 to dest[0] and clamps the
 * sum through the crop table. line_size is not used.
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3789

    
3790
/* No-op function: returns immediately. Its users lie outside this section;
 * NOTE(review): the unprototyped () parameter list may be relied upon by
 * them -- confirm before changing it to (void). */
static void just_return() { return; }
3791

    
3792
/* init static data */
3793
void dsputil_static_init(void)
3794
{
3795
    int i;
3796

    
3797
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3798
    for(i=0;i<MAX_NEG_CROP;i++) {
3799
        cropTbl[i] = 0;
3800
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
3801
    }
3802

    
3803
    for(i=0;i<512;i++) {
3804
        squareTbl[i] = (i - 256) * (i - 256);
3805
    }
3806

    
3807
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3808
}
3809

    
3810

    
3811
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3812
{
3813
    int i;
3814

    
3815
#ifdef CONFIG_ENCODERS
3816
    if(avctx->dct_algo==FF_DCT_FASTINT) {
3817
        c->fdct = fdct_ifast;
3818
        c->fdct248 = fdct_ifast248;
3819
    }
3820
    else if(avctx->dct_algo==FF_DCT_FAAN) {
3821
        c->fdct = ff_faandct;
3822
        c->fdct248 = ff_faandct248;
3823
    }
3824
    else {
3825
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3826
        c->fdct248 = ff_fdct248_islow;
3827
    }
3828
#endif //CONFIG_ENCODERS
3829

    
3830
    if(avctx->lowres==1){
3831
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3832
            c->idct_put= ff_jref_idct4_put;
3833
            c->idct_add= ff_jref_idct4_add;
3834
        }else{
3835
            c->idct_put= ff_h264_lowres_idct_put_c;
3836
            c->idct_add= ff_h264_lowres_idct_add_c;
3837
        }
3838
        c->idct    = j_rev_dct4;
3839
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3840
    }else if(avctx->lowres==2){
3841
        c->idct_put= ff_jref_idct2_put;
3842
        c->idct_add= ff_jref_idct2_add;
3843
        c->idct    = j_rev_dct2;
3844
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3845
    }else if(avctx->lowres==3){
3846
        c->idct_put= ff_jref_idct1_put;
3847
        c->idct_add= ff_jref_idct1_add;
3848
        c->idct    = j_rev_dct1;
3849
        c->idct_permutation_type= FF_NO_IDCT_PERM;
3850
    }else{
3851
        if(avctx->idct_algo==FF_IDCT_INT){
3852
            c->idct_put= ff_jref_idct_put;
3853
            c->idct_add= ff_jref_idct_add;
3854
            c->idct    = j_rev_dct;
3855
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3856
        }else if(avctx->idct_algo==FF_IDCT_VP3){
3857
            c->idct_put= ff_vp3_idct_put_c;
3858
            c->idct_add= ff_vp3_idct_add_c;
3859
            c->idct    = ff_vp3_idct_c;
3860
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3861
        }else{ //accurate/default
3862
            c->idct_put= simple_idct_put;
3863
            c->idct_add= simple_idct_add;
3864
            c->idct    = simple_idct;
3865
            c->idct_permutation_type= FF_NO_IDCT_PERM;
3866
        }
3867
    }
3868

    
3869
    c->h264_idct_add= ff_h264_idct_add_c;
3870
    c->h264_idct8_add= ff_h264_idct8_add_c;
3871
    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3872
    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3873

    
3874
    c->get_pixels = get_pixels_c;
3875
    c->diff_pixels = diff_pixels_c;
3876
    c->put_pixels_clamped = put_pixels_clamped_c;
3877
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3878
    c->add_pixels_clamped = add_pixels_clamped_c;
3879
    c->add_pixels8 = add_pixels8_c;
3880
    c->add_pixels4 = add_pixels4_c;
3881
    c->gmc1 = gmc1_c;
3882
    c->gmc = ff_gmc_c;
3883
    c->clear_blocks = clear_blocks_c;
3884
    c->pix_sum = pix_sum_c;
3885
    c->pix_norm1 = pix_norm1_c;
3886

    
3887
    /* TODO [0] 16  [1] 8 */
3888
    c->pix_abs[0][0] = pix_abs16_c;
3889
    c->pix_abs[0][1] = pix_abs16_x2_c;
3890
    c->pix_abs[0][2] = pix_abs16_y2_c;
3891
    c->pix_abs[0][3] = pix_abs16_xy2_c;
3892
    c->pix_abs[1][0] = pix_abs8_c;
3893
    c->pix_abs[1][1] = pix_abs8_x2_c;
3894
    c->pix_abs[1][2] = pix_abs8_y2_c;
3895
    c->pix_abs[1][3] = pix_abs8_xy2_c;
3896

    
3897
#define dspfunc(PFX, IDX, NUM) \
3898
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3899
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3900
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3901
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3902

    
3903
    dspfunc(put, 0, 16);
3904
    dspfunc(put_no_rnd, 0, 16);
3905
    dspfunc(put, 1, 8);
3906
    dspfunc(put_no_rnd, 1, 8);
3907
    dspfunc(put, 2, 4);
3908
    dspfunc(put, 3, 2);
3909

    
3910
    dspfunc(avg, 0, 16);
3911
    dspfunc(avg_no_rnd, 0, 16);
3912
    dspfunc(avg, 1, 8);
3913
    dspfunc(avg_no_rnd, 1, 8);
3914
    dspfunc(avg, 2, 4);
3915
    dspfunc(avg, 3, 2);
3916
#undef dspfunc
3917

    
3918
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3919
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3920

    
3921
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3922
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3923
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3924
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3925
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3926
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3927
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3928
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3929
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3930

    
3931
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3932
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3933
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3934
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3935
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3936
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3937
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3938
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3939
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3940

    
3941
#define dspfunc(PFX, IDX, NUM) \
3942
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3943
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3944
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3945
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3946
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3947
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3948
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3949
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3950
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3951
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3952
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3953
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3954
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3955
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3956
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3957
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3958

    
3959
    dspfunc(put_qpel, 0, 16);
3960
    dspfunc(put_no_rnd_qpel, 0, 16);
3961

    
3962
    dspfunc(avg_qpel, 0, 16);
3963
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3964

    
3965
    dspfunc(put_qpel, 1, 8);
3966
    dspfunc(put_no_rnd_qpel, 1, 8);
3967

    
3968
    dspfunc(avg_qpel, 1, 8);
3969
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3970

    
3971
    dspfunc(put_h264_qpel, 0, 16);
3972
    dspfunc(put_h264_qpel, 1, 8);
3973
    dspfunc(put_h264_qpel, 2, 4);
3974
    dspfunc(put_h264_qpel, 3, 2);
3975
    dspfunc(avg_h264_qpel, 0, 16);
3976
    dspfunc(avg_h264_qpel, 1, 8);
3977
    dspfunc(avg_h264_qpel, 2, 4);
3978

    
3979
#undef dspfunc
3980
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3981
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3982
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3983
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3984
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3985
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3986

    
3987
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3988
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3989
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3990
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3991
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3992
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3993
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3994
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3995
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3996
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3997
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3998
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3999
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4000
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4001
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4002
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4003
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4004
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4005
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4006
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4007

    
4008
    ff_cavsdsp_init(c,avctx);
4009

    
4010
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4011
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4012
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4013
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4014
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4015
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4016
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4017
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4018

    
4019
#define SET_CMP_FUNC(name) \
4020
    c->name[0]= name ## 16_c;\
4021
    c->name[1]= name ## 8x8_c;
4022

    
4023
    SET_CMP_FUNC(hadamard8_diff)
4024
    c->hadamard8_diff[4]= hadamard8_intra16_c;
4025
    SET_CMP_FUNC(dct_sad)
4026
    SET_CMP_FUNC(dct_max)
4027
#ifdef CONFIG_GPL
4028
    SET_CMP_FUNC(dct264_sad)
4029
#endif
4030
    c->sad[0]= pix_abs16_c;
4031
    c->sad[1]= pix_abs8_c;
4032
    c->sse[0]= sse16_c;
4033
    c->sse[1]= sse8_c;
4034
    c->sse[2]= sse4_c;
4035
    SET_CMP_FUNC(quant_psnr)
4036
    SET_CMP_FUNC(rd)
4037
    SET_CMP_FUNC(bit)
4038
    c->vsad[0]= vsad16_c;
4039
    c->vsad[4]= vsad_intra16_c;
4040
    c->vsse[0]= vsse16_c;
4041
    c->vsse[4]= vsse_intra16_c;
4042
    c->nsse[0]= nsse16_c;
4043
    c->nsse[1]= nsse8_c;
4044
#ifdef CONFIG_SNOW_ENCODER
4045
    c->w53[0]= w53_16_c;
4046
    c->w53[1]= w53_8_c;
4047
    c->w97[0]= w97_16_c;
4048
    c->w97[1]= w97_8_c;
4049
#endif
4050

    
4051
    c->add_bytes= add_bytes_c;
4052
    c->diff_bytes= diff_bytes_c;
4053
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4054
    c->bswap_buf= bswap_buf;
4055

    
4056
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4057
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4058
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4059
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4060
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4061
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4062

    
4063
    c->h263_h_loop_filter= h263_h_loop_filter_c;
4064
    c->h263_v_loop_filter= h263_v_loop_filter_c;
4065

    
4066
    c->h261_loop_filter= h261_loop_filter_c;
4067

    
4068
    c->try_8x8basis= try_8x8basis_c;
4069
    c->add_8x8basis= add_8x8basis_c;
4070

    
4071
#ifdef CONFIG_SNOW_ENCODER
4072
    c->vertical_compose97i = ff_snow_vertical_compose97i;
4073
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4074
    c->inner_add_yblock = ff_snow_inner_add_yblock;
4075
#endif
4076

    
4077
    c->shrink[0]= ff_img_copy_plane;
4078
    c->shrink[1]= ff_shrink22;
4079
    c->shrink[2]= ff_shrink44;
4080
    c->shrink[3]= ff_shrink88;
4081

    
4082
    c->prefetch= just_return;
4083

    
4084
#ifdef HAVE_MMX
4085
    dsputil_init_mmx(c, avctx);
4086
#endif
4087
#ifdef ARCH_ARMV4L
4088
    dsputil_init_armv4l(c, avctx);
4089
#endif
4090
#ifdef HAVE_MLIB
4091
    dsputil_init_mlib(c, avctx);
4092
#endif
4093
#ifdef ARCH_SPARC
4094
   dsputil_init_vis(c,avctx);
4095
#endif
4096
#ifdef ARCH_ALPHA
4097
    dsputil_init_alpha(c, avctx);
4098
#endif
4099
#ifdef ARCH_POWERPC
4100
    dsputil_init_ppc(c, avctx);
4101
#endif
4102
#ifdef HAVE_MMI
4103
    dsputil_init_mmi(c, avctx);
4104
#endif
4105
#ifdef ARCH_SH4
4106
    dsputil_init_sh4(c,avctx);
4107
#endif
4108

    
4109
    switch(c->idct_permutation_type){
4110
    case FF_NO_IDCT_PERM:
4111
        for(i=0; i<64; i++)
4112
            c->idct_permutation[i]= i;
4113
        break;
4114
    case FF_LIBMPEG2_IDCT_PERM:
4115
        for(i=0; i<64; i++)
4116
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4117
        break;
4118
    case FF_SIMPLE_IDCT_PERM:
4119
        for(i=0; i<64; i++)
4120
            c->idct_permutation[i]= simple_mmx_permutation[i];
4121
        break;
4122
    case FF_TRANSPOSE_IDCT_PERM:
4123
        for(i=0; i<64; i++)
4124
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4125
        break;
4126
    case FF_PARTTRANS_IDCT_PERM:
4127
        for(i=0; i<64; i++)
4128
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4129
        break;
4130
    default:
4131
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4132
    }
4133
}
4134