Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ edecaff8

History | View | Annotate | Download (149 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "mpegvideo.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "h263.h"
36
#include "snow.h"
37

    
38
/* snow.c */
39
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40

    
41
/* vorbis.c */
42
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43

    
44
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45
uint32_t ff_squareTbl[512] = {0, };
46

    
47
/* classic 8x8 zigzag scan order (a permutation of 0..63) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
57

    
58
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
70

    
71
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
72
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
73

    
74
/* MPEG-2 style alternate-horizontal scan order (a permutation of 0..63) */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
84

    
85
/* MPEG-2 style alternate-vertical scan order (a permutation of 0..63) */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
95

    
96
/* fixed-point reciprocal table: a*ff_inverse[b]>>32 == a/b
   for all 0<=a<=65536 && 2<=b<=255 (entry b is ~2^32/b, rounded up) */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
131

    
132
/* Input permutation for the simple_idct_mmx (a permutation of 0x00..0x3F) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
143

    
144
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between rows
 * @return the sum (fits easily in an int: max 255*256)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;              /* advance to the next row */
    }
    return sum;
}
165

    
166
static int pix_norm1_c(uint8_t * pix, int line_size)
167
{
168
    int s, i, j;
169
    uint32_t *sq = ff_squareTbl + 256;
170

    
171
    s = 0;
172
    for (i = 0; i < 16; i++) {
173
        for (j = 0; j < 16; j += 8) {
174
#if 0
175
            s += sq[pix[0]];
176
            s += sq[pix[1]];
177
            s += sq[pix[2]];
178
            s += sq[pix[3]];
179
            s += sq[pix[4]];
180
            s += sq[pix[5]];
181
            s += sq[pix[6]];
182
            s += sq[pix[7]];
183
#else
184
#if LONG_MAX > 2147483647
185
            register uint64_t x=*(uint64_t*)pix;
186
            s += sq[x&0xff];
187
            s += sq[(x>>8)&0xff];
188
            s += sq[(x>>16)&0xff];
189
            s += sq[(x>>24)&0xff];
190
            s += sq[(x>>32)&0xff];
191
            s += sq[(x>>40)&0xff];
192
            s += sq[(x>>48)&0xff];
193
            s += sq[(x>>56)&0xff];
194
#else
195
            register uint32_t x=*(uint32_t*)pix;
196
            s += sq[x&0xff];
197
            s += sq[(x>>8)&0xff];
198
            s += sq[(x>>16)&0xff];
199
            s += sq[(x>>24)&0xff];
200
            x=*(uint32_t*)(pix+4);
201
            s += sq[x&0xff];
202
            s += sq[(x>>8)&0xff];
203
            s += sq[(x>>16)&0xff];
204
            s += sq[(x>>24)&0xff];
205
#endif
206
#endif
207
            pix += 8;
208
        }
209
        pix += line_size - 16;
210
    }
211
    return s;
212
}
213

    
214
/**
 * Byte-swap w 32-bit words from src into dst (may be the same buffer
 * element-wise). The hand-unrolled loop of the original is left to the
 * compiler; results are identical.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
231

    
232
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
233
{
234
    int s, i;
235
    uint32_t *sq = ff_squareTbl + 256;
236

    
237
    s = 0;
238
    for (i = 0; i < h; i++) {
239
        s += sq[pix1[0] - pix2[0]];
240
        s += sq[pix1[1] - pix2[1]];
241
        s += sq[pix1[2] - pix2[2]];
242
        s += sq[pix1[3] - pix2[3]];
243
        pix1 += line_size;
244
        pix2 += line_size;
245
    }
246
    return s;
247
}
248

    
249
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
250
{
251
    int s, i;
252
    uint32_t *sq = ff_squareTbl + 256;
253

    
254
    s = 0;
255
    for (i = 0; i < h; i++) {
256
        s += sq[pix1[0] - pix2[0]];
257
        s += sq[pix1[1] - pix2[1]];
258
        s += sq[pix1[2] - pix2[2]];
259
        s += sq[pix1[3] - pix2[3]];
260
        s += sq[pix1[4] - pix2[4]];
261
        s += sq[pix1[5] - pix2[5]];
262
        s += sq[pix1[6] - pix2[6]];
263
        s += sq[pix1[7] - pix2[7]];
264
        pix1 += line_size;
265
        pix2 += line_size;
266
    }
267
    return s;
268
}
269

    
270
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
271
{
272
    int s, i;
273
    uint32_t *sq = ff_squareTbl + 256;
274

    
275
    s = 0;
276
    for (i = 0; i < h; i++) {
277
        s += sq[pix1[ 0] - pix2[ 0]];
278
        s += sq[pix1[ 1] - pix2[ 1]];
279
        s += sq[pix1[ 2] - pix2[ 2]];
280
        s += sq[pix1[ 3] - pix2[ 3]];
281
        s += sq[pix1[ 4] - pix2[ 4]];
282
        s += sq[pix1[ 5] - pix2[ 5]];
283
        s += sq[pix1[ 6] - pix2[ 6]];
284
        s += sq[pix1[ 7] - pix2[ 7]];
285
        s += sq[pix1[ 8] - pix2[ 8]];
286
        s += sq[pix1[ 9] - pix2[ 9]];
287
        s += sq[pix1[10] - pix2[10]];
288
        s += sq[pix1[11] - pix2[11]];
289
        s += sq[pix1[12] - pix2[12]];
290
        s += sq[pix1[13] - pix2[13]];
291
        s += sq[pix1[14] - pix2[14]];
292
        s += sq[pix1[15] - pix2[15]];
293

    
294
        pix1 += line_size;
295
        pix2 += line_size;
296
    }
297
    return s;
298
}
299

    
300

    
301
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-based block comparison: takes the difference of two w x h blocks
 * (w == h, 8/16/32), runs a spatial DWT on it and returns a weighted sum of
 * absolute subband coefficients. type selects 9/7 (0) or 5/3 (1) weights.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* 3 decomposition levels for 8x8, else 4 */
    int tmp[32*32];
    int level, ori;
    /* per-subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* difference of the two blocks, scaled by 16 for fixed-point precision,
       laid out with stride 32 in tmp */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* accumulate |coefficient| * weight over every subband of every level */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){  /* LL band only at level 0 */
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;  /* undo the <<4 input scale and the weight scale */
}
370

    
371
/* thin wrappers binding w_c to a fixed block width and wavelet type
   (w53_* -> 5/3 wavelet, w97_* -> 9/7 wavelet) */

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
395

    
396
/**
 * Copy an 8x8 block of pixels into a (densely packed) DCT coefficient block.
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
414

    
415
/**
 * Element-wise difference s1 - s2 of two 8x8 pixel blocks into a
 * (densely packed) DCT coefficient block.
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
434

    
435

    
436
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437
                                 int line_size)
438
{
439
    int i;
440
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
441

    
442
    /* read the pixels */
443
    for(i=0;i<8;i++) {
444
        pixels[0] = cm[block[0]];
445
        pixels[1] = cm[block[1]];
446
        pixels[2] = cm[block[2]];
447
        pixels[3] = cm[block[3]];
448
        pixels[4] = cm[block[4]];
449
        pixels[5] = cm[block[5]];
450
        pixels[6] = cm[block[6]];
451
        pixels[7] = cm[block[7]];
452

    
453
        pixels += line_size;
454
        block += 8;
455
    }
456
}
457

    
458
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459
                                 int line_size)
460
{
461
    int i;
462
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
463

    
464
    /* read the pixels */
465
    for(i=0;i<4;i++) {
466
        pixels[0] = cm[block[0]];
467
        pixels[1] = cm[block[1]];
468
        pixels[2] = cm[block[2]];
469
        pixels[3] = cm[block[3]];
470

    
471
        pixels += line_size;
472
        block += 8;
473
    }
474
}
475

    
476
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477
                                 int line_size)
478
{
479
    int i;
480
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
481

    
482
    /* read the pixels */
483
    for(i=0;i<2;i++) {
484
        pixels[0] = cm[block[0]];
485
        pixels[1] = cm[block[1]];
486

    
487
        pixels += line_size;
488
        block += 8;
489
    }
490
}
491

    
492
/**
 * Store an 8x8 signed coefficient block as pixels: each value is offset
 * by +128 and clamped to 0..255.
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = *block++;
            /* clamp v+128 into [0, 255] */
            *pixels++ = v < -128 ? 0 : v > 127 ? 255 : (uint8_t)(v + 128);
        }
        pixels += (line_size - 8);
    }
}
512

    
513
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
514
                          int line_size)
515
{
516
    int i;
517
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
518

    
519
    /* read the pixels */
520
    for(i=0;i<8;i++) {
521
        pixels[0] = cm[pixels[0] + block[0]];
522
        pixels[1] = cm[pixels[1] + block[1]];
523
        pixels[2] = cm[pixels[2] + block[2]];
524
        pixels[3] = cm[pixels[3] + block[3]];
525
        pixels[4] = cm[pixels[4] + block[4]];
526
        pixels[5] = cm[pixels[5] + block[5]];
527
        pixels[6] = cm[pixels[6] + block[6]];
528
        pixels[7] = cm[pixels[7] + block[7]];
529
        pixels += line_size;
530
        block += 8;
531
    }
532
}
533

    
534
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
535
                          int line_size)
536
{
537
    int i;
538
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
539

    
540
    /* read the pixels */
541
    for(i=0;i<4;i++) {
542
        pixels[0] = cm[pixels[0] + block[0]];
543
        pixels[1] = cm[pixels[1] + block[1]];
544
        pixels[2] = cm[pixels[2] + block[2]];
545
        pixels[3] = cm[pixels[3] + block[3]];
546
        pixels += line_size;
547
        block += 8;
548
    }
549
}
550

    
551
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
552
                          int line_size)
553
{
554
    int i;
555
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
556

    
557
    /* read the pixels */
558
    for(i=0;i<2;i++) {
559
        pixels[0] = cm[pixels[0] + block[0]];
560
        pixels[1] = cm[pixels[1] + block[1]];
561
        pixels += line_size;
562
        block += 8;
563
    }
564
}
565

    
566
/**
 * Add an 8x8 coefficient block to pixels WITHOUT clamping
 * (wraps on uint8_t overflow).
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
582

    
583
/**
 * Add a 4x4 coefficient block to pixels without clamping.
 * NOTE: unlike the 8-pixel variants, block here is densely packed
 * 4 coefficients per row (block += 4).
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
595

    
596
/**
 * Sum of absolute values of all 64 coefficients of a DCT block.
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int i, total = 0;

    for (i = 0; i < 64; i++)
        total += FFABS(block[i]);
    return total;
}
603

    
604
#if 0
605

606
#define PIXOP2(OPNAME, OP) \
607
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
608
{\
609
    int i;\
610
    for(i=0; i<h; i++){\
611
        OP(*((uint64_t*)block), LD64(pixels));\
612
        pixels+=line_size;\
613
        block +=line_size;\
614
    }\
615
}\
616
\
617
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
618
{\
619
    int i;\
620
    for(i=0; i<h; i++){\
621
        const uint64_t a= LD64(pixels  );\
622
        const uint64_t b= LD64(pixels+1);\
623
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
624
        pixels+=line_size;\
625
        block +=line_size;\
626
    }\
627
}\
628
\
629
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
630
{\
631
    int i;\
632
    for(i=0; i<h; i++){\
633
        const uint64_t a= LD64(pixels  );\
634
        const uint64_t b= LD64(pixels+1);\
635
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
636
        pixels+=line_size;\
637
        block +=line_size;\
638
    }\
639
}\
640
\
641
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
642
{\
643
    int i;\
644
    for(i=0; i<h; i++){\
645
        const uint64_t a= LD64(pixels          );\
646
        const uint64_t b= LD64(pixels+line_size);\
647
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
648
        pixels+=line_size;\
649
        block +=line_size;\
650
    }\
651
}\
652
\
653
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
654
{\
655
    int i;\
656
    for(i=0; i<h; i++){\
657
        const uint64_t a= LD64(pixels          );\
658
        const uint64_t b= LD64(pixels+line_size);\
659
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
660
        pixels+=line_size;\
661
        block +=line_size;\
662
    }\
663
}\
664
\
665
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
666
{\
667
        int i;\
668
        const uint64_t a= LD64(pixels  );\
669
        const uint64_t b= LD64(pixels+1);\
670
        uint64_t l0=  (a&0x0303030303030303ULL)\
671
                    + (b&0x0303030303030303ULL)\
672
                    + 0x0202020202020202ULL;\
673
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
674
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
675
        uint64_t l1,h1;\
676
\
677
        pixels+=line_size;\
678
        for(i=0; i<h; i+=2){\
679
            uint64_t a= LD64(pixels  );\
680
            uint64_t b= LD64(pixels+1);\
681
            l1=  (a&0x0303030303030303ULL)\
682
               + (b&0x0303030303030303ULL);\
683
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
684
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
685
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
686
            pixels+=line_size;\
687
            block +=line_size;\
688
            a= LD64(pixels  );\
689
            b= LD64(pixels+1);\
690
            l0=  (a&0x0303030303030303ULL)\
691
               + (b&0x0303030303030303ULL)\
692
               + 0x0202020202020202ULL;\
693
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
694
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
695
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
696
            pixels+=line_size;\
697
            block +=line_size;\
698
        }\
699
}\
700
\
701
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
702
{\
703
        int i;\
704
        const uint64_t a= LD64(pixels  );\
705
        const uint64_t b= LD64(pixels+1);\
706
        uint64_t l0=  (a&0x0303030303030303ULL)\
707
                    + (b&0x0303030303030303ULL)\
708
                    + 0x0101010101010101ULL;\
709
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
710
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
711
        uint64_t l1,h1;\
712
\
713
        pixels+=line_size;\
714
        for(i=0; i<h; i+=2){\
715
            uint64_t a= LD64(pixels  );\
716
            uint64_t b= LD64(pixels+1);\
717
            l1=  (a&0x0303030303030303ULL)\
718
               + (b&0x0303030303030303ULL);\
719
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
720
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
721
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
722
            pixels+=line_size;\
723
            block +=line_size;\
724
            a= LD64(pixels  );\
725
            b= LD64(pixels+1);\
726
            l0=  (a&0x0303030303030303ULL)\
727
               + (b&0x0303030303030303ULL)\
728
               + 0x0101010101010101ULL;\
729
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
730
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
731
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
732
            pixels+=line_size;\
733
            block +=line_size;\
734
        }\
735
}\
736
\
737
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
738
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
739
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
740
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
741
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
742
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
743
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
744

745
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
746
#else // 64 bit variant
747

    
748
#define PIXOP2(OPNAME, OP) \
749
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
750
    int i;\
751
    for(i=0; i<h; i++){\
752
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
753
        pixels+=line_size;\
754
        block +=line_size;\
755
    }\
756
}\
757
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
758
    int i;\
759
    for(i=0; i<h; i++){\
760
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
761
        pixels+=line_size;\
762
        block +=line_size;\
763
    }\
764
}\
765
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766
    int i;\
767
    for(i=0; i<h; i++){\
768
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
769
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
770
        pixels+=line_size;\
771
        block +=line_size;\
772
    }\
773
}\
774
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
775
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
776
}\
777
\
778
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779
                                                int src_stride1, int src_stride2, int h){\
780
    int i;\
781
    for(i=0; i<h; i++){\
782
        uint32_t a,b;\
783
        a= LD32(&src1[i*src_stride1  ]);\
784
        b= LD32(&src2[i*src_stride2  ]);\
785
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
786
        a= LD32(&src1[i*src_stride1+4]);\
787
        b= LD32(&src2[i*src_stride2+4]);\
788
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
789
    }\
790
}\
791
\
792
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793
                                                int src_stride1, int src_stride2, int h){\
794
    int i;\
795
    for(i=0; i<h; i++){\
796
        uint32_t a,b;\
797
        a= LD32(&src1[i*src_stride1  ]);\
798
        b= LD32(&src2[i*src_stride2  ]);\
799
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
800
        a= LD32(&src1[i*src_stride1+4]);\
801
        b= LD32(&src2[i*src_stride2+4]);\
802
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
803
    }\
804
}\
805
\
806
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807
                                                int src_stride1, int src_stride2, int h){\
808
    int i;\
809
    for(i=0; i<h; i++){\
810
        uint32_t a,b;\
811
        a= LD32(&src1[i*src_stride1  ]);\
812
        b= LD32(&src2[i*src_stride2  ]);\
813
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
814
    }\
815
}\
816
\
817
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818
                                                int src_stride1, int src_stride2, int h){\
819
    int i;\
820
    for(i=0; i<h; i++){\
821
        uint32_t a,b;\
822
        a= LD16(&src1[i*src_stride1  ]);\
823
        b= LD16(&src2[i*src_stride2  ]);\
824
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
825
    }\
826
}\
827
\
828
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
829
                                                int src_stride1, int src_stride2, int h){\
830
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
831
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
832
}\
833
\
834
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
835
                                                int src_stride1, int src_stride2, int h){\
836
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
837
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
838
}\
839
\
840
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
841
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
842
}\
843
\
844
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
846
}\
847
\
848
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
850
}\
851
\
852
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
854
}\
855
\
856
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
857
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
858
    int i;\
859
    for(i=0; i<h; i++){\
860
        uint32_t a, b, c, d, l0, l1, h0, h1;\
861
        a= LD32(&src1[i*src_stride1]);\
862
        b= LD32(&src2[i*src_stride2]);\
863
        c= LD32(&src3[i*src_stride3]);\
864
        d= LD32(&src4[i*src_stride4]);\
865
        l0=  (a&0x03030303UL)\
866
           + (b&0x03030303UL)\
867
           + 0x02020202UL;\
868
        h0= ((a&0xFCFCFCFCUL)>>2)\
869
          + ((b&0xFCFCFCFCUL)>>2);\
870
        l1=  (c&0x03030303UL)\
871
           + (d&0x03030303UL);\
872
        h1= ((c&0xFCFCFCFCUL)>>2)\
873
          + ((d&0xFCFCFCFCUL)>>2);\
874
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
875
        a= LD32(&src1[i*src_stride1+4]);\
876
        b= LD32(&src2[i*src_stride2+4]);\
877
        c= LD32(&src3[i*src_stride3+4]);\
878
        d= LD32(&src4[i*src_stride4+4]);\
879
        l0=  (a&0x03030303UL)\
880
           + (b&0x03030303UL)\
881
           + 0x02020202UL;\
882
        h0= ((a&0xFCFCFCFCUL)>>2)\
883
          + ((b&0xFCFCFCFCUL)>>2);\
884
        l1=  (c&0x03030303UL)\
885
           + (d&0x03030303UL);\
886
        h1= ((c&0xFCFCFCFCUL)>>2)\
887
          + ((d&0xFCFCFCFCUL)>>2);\
888
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
889
    }\
890
}\
891
\
892
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
893
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
894
}\
895
\
896
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
897
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
898
}\
899
\
900
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
901
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
902
}\
903
\
904
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
905
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
906
}\
907
\
908
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
909
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
910
    int i;\
911
    for(i=0; i<h; i++){\
912
        uint32_t a, b, c, d, l0, l1, h0, h1;\
913
        a= LD32(&src1[i*src_stride1]);\
914
        b= LD32(&src2[i*src_stride2]);\
915
        c= LD32(&src3[i*src_stride3]);\
916
        d= LD32(&src4[i*src_stride4]);\
917
        l0=  (a&0x03030303UL)\
918
           + (b&0x03030303UL)\
919
           + 0x01010101UL;\
920
        h0= ((a&0xFCFCFCFCUL)>>2)\
921
          + ((b&0xFCFCFCFCUL)>>2);\
922
        l1=  (c&0x03030303UL)\
923
           + (d&0x03030303UL);\
924
        h1= ((c&0xFCFCFCFCUL)>>2)\
925
          + ((d&0xFCFCFCFCUL)>>2);\
926
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
927
        a= LD32(&src1[i*src_stride1+4]);\
928
        b= LD32(&src2[i*src_stride2+4]);\
929
        c= LD32(&src3[i*src_stride3+4]);\
930
        d= LD32(&src4[i*src_stride4+4]);\
931
        l0=  (a&0x03030303UL)\
932
           + (b&0x03030303UL)\
933
           + 0x01010101UL;\
934
        h0= ((a&0xFCFCFCFCUL)>>2)\
935
          + ((b&0xFCFCFCFCUL)>>2);\
936
        l1=  (c&0x03030303UL)\
937
           + (d&0x03030303UL);\
938
        h1= ((c&0xFCFCFCFCUL)>>2)\
939
          + ((d&0xFCFCFCFCUL)>>2);\
940
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
941
    }\
942
}\
943
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
944
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
945
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
946
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
947
}\
948
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
949
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
950
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
952
}\
953
\
954
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
955
{\
956
        int i, a0, b0, a1, b1;\
957
        a0= pixels[0];\
958
        b0= pixels[1] + 2;\
959
        a0 += b0;\
960
        b0 += pixels[2];\
961
\
962
        pixels+=line_size;\
963
        for(i=0; i<h; i+=2){\
964
            a1= pixels[0];\
965
            b1= pixels[1];\
966
            a1 += b1;\
967
            b1 += pixels[2];\
968
\
969
            block[0]= (a1+a0)>>2; /* FIXME non put */\
970
            block[1]= (b1+b0)>>2;\
971
\
972
            pixels+=line_size;\
973
            block +=line_size;\
974
\
975
            a0= pixels[0];\
976
            b0= pixels[1] + 2;\
977
            a0 += b0;\
978
            b0 += pixels[2];\
979
\
980
            block[0]= (a1+a0)>>2;\
981
            block[1]= (b1+b0)>>2;\
982
            pixels+=line_size;\
983
            block +=line_size;\
984
        }\
985
}\
986
\
987
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
988
{\
989
        int i;\
990
        const uint32_t a= LD32(pixels  );\
991
        const uint32_t b= LD32(pixels+1);\
992
        uint32_t l0=  (a&0x03030303UL)\
993
                    + (b&0x03030303UL)\
994
                    + 0x02020202UL;\
995
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
996
                   + ((b&0xFCFCFCFCUL)>>2);\
997
        uint32_t l1,h1;\
998
\
999
        pixels+=line_size;\
1000
        for(i=0; i<h; i+=2){\
1001
            uint32_t a= LD32(pixels  );\
1002
            uint32_t b= LD32(pixels+1);\
1003
            l1=  (a&0x03030303UL)\
1004
               + (b&0x03030303UL);\
1005
            h1= ((a&0xFCFCFCFCUL)>>2)\
1006
              + ((b&0xFCFCFCFCUL)>>2);\
1007
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008
            pixels+=line_size;\
1009
            block +=line_size;\
1010
            a= LD32(pixels  );\
1011
            b= LD32(pixels+1);\
1012
            l0=  (a&0x03030303UL)\
1013
               + (b&0x03030303UL)\
1014
               + 0x02020202UL;\
1015
            h0= ((a&0xFCFCFCFCUL)>>2)\
1016
              + ((b&0xFCFCFCFCUL)>>2);\
1017
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018
            pixels+=line_size;\
1019
            block +=line_size;\
1020
        }\
1021
}\
1022
\
1023
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1024
{\
1025
    int j;\
1026
    for(j=0; j<2; j++){\
1027
        int i;\
1028
        const uint32_t a= LD32(pixels  );\
1029
        const uint32_t b= LD32(pixels+1);\
1030
        uint32_t l0=  (a&0x03030303UL)\
1031
                    + (b&0x03030303UL)\
1032
                    + 0x02020202UL;\
1033
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1034
                   + ((b&0xFCFCFCFCUL)>>2);\
1035
        uint32_t l1,h1;\
1036
\
1037
        pixels+=line_size;\
1038
        for(i=0; i<h; i+=2){\
1039
            uint32_t a= LD32(pixels  );\
1040
            uint32_t b= LD32(pixels+1);\
1041
            l1=  (a&0x03030303UL)\
1042
               + (b&0x03030303UL);\
1043
            h1= ((a&0xFCFCFCFCUL)>>2)\
1044
              + ((b&0xFCFCFCFCUL)>>2);\
1045
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046
            pixels+=line_size;\
1047
            block +=line_size;\
1048
            a= LD32(pixels  );\
1049
            b= LD32(pixels+1);\
1050
            l0=  (a&0x03030303UL)\
1051
               + (b&0x03030303UL)\
1052
               + 0x02020202UL;\
1053
            h0= ((a&0xFCFCFCFCUL)>>2)\
1054
              + ((b&0xFCFCFCFCUL)>>2);\
1055
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1056
            pixels+=line_size;\
1057
            block +=line_size;\
1058
        }\
1059
        pixels+=4-line_size*(h+1);\
1060
        block +=4-line_size*h;\
1061
    }\
1062
}\
1063
\
1064
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1065
{\
1066
    int j;\
1067
    for(j=0; j<2; j++){\
1068
        int i;\
1069
        const uint32_t a= LD32(pixels  );\
1070
        const uint32_t b= LD32(pixels+1);\
1071
        uint32_t l0=  (a&0x03030303UL)\
1072
                    + (b&0x03030303UL)\
1073
                    + 0x01010101UL;\
1074
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1075
                   + ((b&0xFCFCFCFCUL)>>2);\
1076
        uint32_t l1,h1;\
1077
\
1078
        pixels+=line_size;\
1079
        for(i=0; i<h; i+=2){\
1080
            uint32_t a= LD32(pixels  );\
1081
            uint32_t b= LD32(pixels+1);\
1082
            l1=  (a&0x03030303UL)\
1083
               + (b&0x03030303UL);\
1084
            h1= ((a&0xFCFCFCFCUL)>>2)\
1085
              + ((b&0xFCFCFCFCUL)>>2);\
1086
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087
            pixels+=line_size;\
1088
            block +=line_size;\
1089
            a= LD32(pixels  );\
1090
            b= LD32(pixels+1);\
1091
            l0=  (a&0x03030303UL)\
1092
               + (b&0x03030303UL)\
1093
               + 0x01010101UL;\
1094
            h0= ((a&0xFCFCFCFCUL)>>2)\
1095
              + ((b&0xFCFCFCFCUL)>>2);\
1096
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1097
            pixels+=line_size;\
1098
            block +=line_size;\
1099
        }\
1100
        pixels+=4-line_size*(h+1);\
1101
        block +=4-line_size*h;\
1102
    }\
1103
}\
1104
\
1105
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1106
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1107
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1108
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1109
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1110
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1111
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1112
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1113

    
1114
#define op_avg(a, b) a = rnd_avg32(a, b)
1115
#endif
1116
#define op_put(a, b) a = b
1117

    
1118
PIXOP2(avg, op_avg)
1119
PIXOP2(put, op_put)
1120
#undef op_avg
1121
#undef op_put
1122

    
1123
/* Rounded 2- and 4-way byte averages used by the no_rnd/halfpel helpers.
 * Arguments are fully parenthesized (CERT PRE01-C): the previous
 * definitions expanded e.g. avg2(1 & 3, 4) as ((1 & 3+4+1)>>1), silently
 * mis-parsing any argument containing an operator of lower precedence
 * than '+'. All existing call sites pass simple operands, so this is
 * behavior-compatible for them. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1125

    
1126
/* Convenience wrapper: average two 16-wide pixel blocks into dst with the
 * no-rounding variant, using one common stride for all three buffers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1129

    
1130
/* Convenience wrapper: average two 8-wide pixel blocks into dst with the
 * no-rounding variant, using one common stride for all three buffers. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1133

    
1134
/* "GMC1" one-point global motion compensation: bilinear interpolation of
 * an 8-wide block at a 1/16-pel offset (x16, y16), each in [0,16].
 * The four corner weights sum to 256, so the result is renormalized with
 * (>> 8) after adding the caller-supplied rounder. Reads one extra column
 * and one extra row of src beyond the 8xh output block. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1156

    
1157
/* Global motion compensation of one 8-wide strip with a linear (affine-style)
 * motion field: per output pixel the motion vector is (vx, vy), advanced by
 * (dxx, dyx) along x and by (dxy, dyy) per row. Coordinates are 16.16 fixed
 * point; the low 'shift' bits below the integer sample position give the
 * bilinear fractions. Samples outside the (width x height) source are
 * clamped to the border (av_clip), dropping the tap that falls outside. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    /* convert sizes to the last valid coordinate; the second bilinear tap
       needs one more column/row, which the callers guarantee */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int x_in, y_in, index;

            src_x >>= shift;
            src_y >>= shift;

            /* unsigned compare folds the "negative" and "too large" tests */
            x_in = (unsigned)src_x < width;
            y_in = (unsigned)src_y < height;

            if (x_in && y_in) {
                /* both taps valid: full 2-D bilinear interpolation */
                index = src_x + src_y * stride;
                dst[y*stride + x] = (  (  src[index             ] * (s - frac_x)
                                        + src[index + 1         ] *      frac_x ) * (s - frac_y)
                                     + (  src[index + stride    ] * (s - frac_x)
                                        + src[index + stride + 1] *      frac_x ) *      frac_y
                                     + r) >> (shift * 2);
            } else if (x_in) {
                /* vertically clamped: horizontal interpolation only */
                index = src_x + av_clip(src_y, 0, height) * stride;
                dst[y*stride + x] = ( (  src[index    ] * (s - frac_x)
                                       + src[index + 1] *      frac_x ) * s
                                     + r) >> (shift * 2);
            } else if (y_in) {
                /* horizontally clamped: vertical interpolation only */
                index = av_clip(src_x, 0, width) + src_y * stride;
                dst[y*stride + x] = ( (  src[index         ] * (s - frac_y)
                                       + src[index + stride] *      frac_y ) * s
                                     + r) >> (shift * 2);
            } else {
                /* fully clamped corner: nearest border sample */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                dst[y*stride + x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1214

    
1215
/* Full-pel copy for the thirdpel MC family: dispatch on block width to the
 * plain copy routines. Unsupported widths are silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1223

    
1224
/* Thirdpel MC, horizontal offset 1/3: weights 2:1 between src[x] and
 * src[x+1]; 683 ~= 2048/3 implements the divide-by-3 with rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1234

    
1235
/* Thirdpel MC, horizontal offset 2/3: weights 1:2 between src[x] and
 * src[x+1]; 683 ~= 2048/3 implements the divide-by-3 with rounding. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1245

    
1246
/* Thirdpel MC, vertical offset 1/3: weights 2:1 between src[x] and
 * src[x+stride]; 683 ~= 2048/3 implements the divide-by-3 with rounding. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1256

    
1257
/* Thirdpel MC, offset (1/3, 1/3): bilinear taps weighted 4:3:3:2 (sum 12);
 * 2731 ~= 32768/12 implements the divide-by-12 with rounding. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1267

    
1268
/* Thirdpel MC, offset (1/3, 2/3): bilinear taps weighted 3:2:4:3 (sum 12);
 * 2731 ~= 32768/12 implements the divide-by-12 with rounding. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] +
                              4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1278

    
1279
/* Thirdpel MC, vertical offset 2/3: weights 1:2 between src[x] and
 * src[x+stride]; 683 ~= 2048/3 implements the divide-by-3 with rounding. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1289

    
1290
/* Thirdpel MC, offset (2/3, 1/3): bilinear taps weighted 3:4:2:3 (sum 12);
 * 2731 ~= 32768/12 implements the divide-by-12 with rounding. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] +
                              2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1300

    
1301
/* Thirdpel MC, offset (2/3, 2/3): bilinear taps weighted 2:3:3:4 (sum 12);
 * 2731 ~= 32768/12 implements the divide-by-12 with rounding. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1311

    
1312
/* Full-pel averaging for the thirdpel MC family: dispatch on block width to
 * the plain averaging routines. Unsupported widths are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1320

    
1321
/* Thirdpel MC with averaging, horizontal offset 1/3: interpolate as in the
 * put variant (weights 2:1, 683 ~= 2048/3), then round-average into dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1331

    
1332
/* Thirdpel MC with averaging, horizontal offset 2/3: interpolate as in the
 * put variant (weights 1:2, 683 ~= 2048/3), then round-average into dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1342

    
1343
/* Thirdpel MC with averaging, vertical offset 1/3: interpolate as in the
 * put variant (weights 2:1, 683 ~= 2048/3), then round-average into dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1353

    
1354
/* Thirdpel MC with averaging, offset (1/3, 1/3): bilinear taps 4:3:3:2
 * (2731 ~= 32768/12), then round-average the result into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1] +
                                         3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1364

    
1365
/* Thirdpel MC with averaging, offset (1/3, 2/3): bilinear taps 3:2:4:3
 * (2731 ~= 32768/12), then round-average the result into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1] +
                                         4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1375

    
1376
/* Thirdpel MC with averaging, vertical offset 2/3: interpolate as in the
 * put variant (weights 1:2, 683 ~= 2048/3), then round-average into dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1386

    
1387
/* Thirdpel MC with averaging, offset (2/3, 1/3): bilinear taps 3:4:2:3
 * (2731 ~= 32768/12), then round-average the result into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1] +
                                         2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1397

    
1398
/* Thirdpel MC with averaging, offset (2/3, 2/3): bilinear taps 2:3:3:4
 * (2731 ~= 32768/12), then round-average the result into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1] +
                                         3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1408
#if 0
1409
#define TPEL_WIDTH(width)\
1410
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1412
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1414
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1415
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1416
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1417
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1418
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1419
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1420
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1421
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1422
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1423
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1424
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1425
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1426
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1427
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1428
#endif
1429

    
1430
/* Generates the 2/4/8-wide H.264 chroma MC functions for a given output
 * operator OP (put or avg). x and y are eighth-pel offsets in [0,8); the
 * four bilinear corner weights A..D sum to 64, and OP is responsible for
 * the final renormalization. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<2; col++)\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<4; col++)\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<8; col++)\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1492

    
1493
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1494
#define op_put(a, b) a = (((b) + 32)>>6)
1495

    
1496
H264_CHROMA_MC(put_       , op_put)
1497
H264_CHROMA_MC(avg_       , op_avg)
1498
#undef op_avg
1499
#undef op_put
1500

    
1501
/* H.264 chroma MC, 8-wide, no-rounding variant: same bilinear weights as
 * the regular put (A..D sum to 64) but the rounding constant is 32-4=28,
 * biasing the renormalization slightly downward. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    assert(x < 8 && y < 8 && x >= 0 && y >= 0);

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1524

    
1525
#define QPEL_MC(r, OPNAME, RND, OP) \
1526
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1528
    int i;\
1529
    for(i=0; i<h; i++)\
1530
    {\
1531
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1536
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1539
        dst+=dstStride;\
1540
        src+=srcStride;\
1541
    }\
1542
}\
1543
\
1544
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1545
    const int w=8;\
1546
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1547
    int i;\
1548
    for(i=0; i<w; i++)\
1549
    {\
1550
        const int src0= src[0*srcStride];\
1551
        const int src1= src[1*srcStride];\
1552
        const int src2= src[2*srcStride];\
1553
        const int src3= src[3*srcStride];\
1554
        const int src4= src[4*srcStride];\
1555
        const int src5= src[5*srcStride];\
1556
        const int src6= src[6*srcStride];\
1557
        const int src7= src[7*srcStride];\
1558
        const int src8= src[8*srcStride];\
1559
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1567
        dst++;\
1568
        src++;\
1569
    }\
1570
}\
1571
\
1572
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1574
    int i;\
1575
    \
1576
    for(i=0; i<h; i++)\
1577
    {\
1578
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1594
        dst+=dstStride;\
1595
        src+=srcStride;\
1596
    }\
1597
}\
1598
\
1599
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1601
    int i;\
1602
    const int w=16;\
1603
    for(i=0; i<w; i++)\
1604
    {\
1605
        const int src0= src[0*srcStride];\
1606
        const int src1= src[1*srcStride];\
1607
        const int src2= src[2*srcStride];\
1608
        const int src3= src[3*srcStride];\
1609
        const int src4= src[4*srcStride];\
1610
        const int src5= src[5*srcStride];\
1611
        const int src6= src[6*srcStride];\
1612
        const int src7= src[7*srcStride];\
1613
        const int src8= src[8*srcStride];\
1614
        const int src9= src[9*srcStride];\
1615
        const int src10= src[10*srcStride];\
1616
        const int src11= src[11*srcStride];\
1617
        const int src12= src[12*srcStride];\
1618
        const int src13= src[13*srcStride];\
1619
        const int src14= src[14*srcStride];\
1620
        const int src15= src[15*srcStride];\
1621
        const int src16= src[16*srcStride];\
1622
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1638
        dst++;\
1639
        src++;\
1640
    }\
1641
}\
1642
\
1643
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1645
}\
1646
\
1647
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1648
    uint8_t half[64];\
1649
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1651
}\
1652
\
1653
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1655
}\
1656
\
1657
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1658
    uint8_t half[64];\
1659
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1661
}\
1662
\
1663
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664
    uint8_t full[16*9];\
1665
    uint8_t half[64];\
1666
    copy_block9(full, src, 16, stride, 9);\
1667
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1669
}\
1670
\
1671
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672
    uint8_t full[16*9];\
1673
    copy_block9(full, src, 16, stride, 9);\
1674
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1675
}\
1676
\
1677
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678
    uint8_t full[16*9];\
1679
    uint8_t half[64];\
1680
    copy_block9(full, src, 16, stride, 9);\
1681
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1683
}\
1684
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685
    uint8_t full[16*9];\
1686
    uint8_t halfH[72];\
1687
    uint8_t halfV[64];\
1688
    uint8_t halfHV[64];\
1689
    copy_block9(full, src, 16, stride, 9);\
1690
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1694
}\
1695
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696
    uint8_t full[16*9];\
1697
    uint8_t halfH[72];\
1698
    uint8_t halfHV[64];\
1699
    copy_block9(full, src, 16, stride, 9);\
1700
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1704
}\
1705
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706
    uint8_t full[16*9];\
1707
    uint8_t halfH[72];\
1708
    uint8_t halfV[64];\
1709
    uint8_t halfHV[64];\
1710
    copy_block9(full, src, 16, stride, 9);\
1711
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1715
}\
1716
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717
    uint8_t full[16*9];\
1718
    uint8_t halfH[72];\
1719
    uint8_t halfHV[64];\
1720
    copy_block9(full, src, 16, stride, 9);\
1721
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1725
}\
1726
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727
    uint8_t full[16*9];\
1728
    uint8_t halfH[72];\
1729
    uint8_t halfV[64];\
1730
    uint8_t halfHV[64];\
1731
    copy_block9(full, src, 16, stride, 9);\
1732
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736
}\
1737
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738
    uint8_t full[16*9];\
1739
    uint8_t halfH[72];\
1740
    uint8_t halfHV[64];\
1741
    copy_block9(full, src, 16, stride, 9);\
1742
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1746
}\
1747
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748
    uint8_t full[16*9];\
1749
    uint8_t halfH[72];\
1750
    uint8_t halfV[64];\
1751
    uint8_t halfHV[64];\
1752
    copy_block9(full, src, 16, stride, 9);\
1753
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1754
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757
}\
1758
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759
    uint8_t full[16*9];\
1760
    uint8_t halfH[72];\
1761
    uint8_t halfHV[64];\
1762
    copy_block9(full, src, 16, stride, 9);\
1763
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1767
}\
1768
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1769
    uint8_t halfH[72];\
1770
    uint8_t halfHV[64];\
1771
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1774
}\
1775
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1776
    uint8_t halfH[72];\
1777
    uint8_t halfHV[64];\
1778
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1781
}\
1782
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783
    uint8_t full[16*9];\
1784
    uint8_t halfH[72];\
1785
    uint8_t halfV[64];\
1786
    uint8_t halfHV[64];\
1787
    copy_block9(full, src, 16, stride, 9);\
1788
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1792
}\
1793
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794
    uint8_t full[16*9];\
1795
    uint8_t halfH[72];\
1796
    copy_block9(full, src, 16, stride, 9);\
1797
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1800
}\
1801
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802
    uint8_t full[16*9];\
1803
    uint8_t halfH[72];\
1804
    uint8_t halfV[64];\
1805
    uint8_t halfHV[64];\
1806
    copy_block9(full, src, 16, stride, 9);\
1807
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1811
}\
1812
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813
    uint8_t full[16*9];\
1814
    uint8_t halfH[72];\
1815
    copy_block9(full, src, 16, stride, 9);\
1816
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1819
}\
1820
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1821
    uint8_t halfH[72];\
1822
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1824
}\
1825
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1827
}\
1828
\
1829
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1830
    uint8_t half[256];\
1831
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1833
}\
1834
\
1835
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1837
}\
1838
\
1839
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1840
    uint8_t half[256];\
1841
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1843
}\
1844
\
1845
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846
    uint8_t full[24*17];\
1847
    uint8_t half[256];\
1848
    copy_block17(full, src, 24, stride, 17);\
1849
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1851
}\
1852
\
1853
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854
    uint8_t full[24*17];\
1855
    copy_block17(full, src, 24, stride, 17);\
1856
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1857
}\
1858
\
1859
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860
    uint8_t full[24*17];\
1861
    uint8_t half[256];\
1862
    copy_block17(full, src, 24, stride, 17);\
1863
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1865
}\
1866
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867
    uint8_t full[24*17];\
1868
    uint8_t halfH[272];\
1869
    uint8_t halfV[256];\
1870
    uint8_t halfHV[256];\
1871
    copy_block17(full, src, 24, stride, 17);\
1872
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1876
}\
1877
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878
    uint8_t full[24*17];\
1879
    uint8_t halfH[272];\
1880
    uint8_t halfHV[256];\
1881
    copy_block17(full, src, 24, stride, 17);\
1882
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1886
}\
1887
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888
    uint8_t full[24*17];\
1889
    uint8_t halfH[272];\
1890
    uint8_t halfV[256];\
1891
    uint8_t halfHV[256];\
1892
    copy_block17(full, src, 24, stride, 17);\
1893
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1897
}\
1898
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899
    uint8_t full[24*17];\
1900
    uint8_t halfH[272];\
1901
    uint8_t halfHV[256];\
1902
    copy_block17(full, src, 24, stride, 17);\
1903
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1907
}\
1908
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909
    uint8_t full[24*17];\
1910
    uint8_t halfH[272];\
1911
    uint8_t halfV[256];\
1912
    uint8_t halfHV[256];\
1913
    copy_block17(full, src, 24, stride, 17);\
1914
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918
}\
1919
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920
    uint8_t full[24*17];\
1921
    uint8_t halfH[272];\
1922
    uint8_t halfHV[256];\
1923
    copy_block17(full, src, 24, stride, 17);\
1924
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1928
}\
1929
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930
    uint8_t full[24*17];\
1931
    uint8_t halfH[272];\
1932
    uint8_t halfV[256];\
1933
    uint8_t halfHV[256];\
1934
    copy_block17(full, src, 24, stride, 17);\
1935
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1936
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939
}\
1940
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941
    uint8_t full[24*17];\
1942
    uint8_t halfH[272];\
1943
    uint8_t halfHV[256];\
1944
    copy_block17(full, src, 24, stride, 17);\
1945
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1949
}\
1950
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951
    uint8_t halfH[272];\
1952
    uint8_t halfHV[256];\
1953
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1956
}\
1957
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958
    uint8_t halfH[272];\
1959
    uint8_t halfHV[256];\
1960
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1963
}\
1964
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965
    uint8_t full[24*17];\
1966
    uint8_t halfH[272];\
1967
    uint8_t halfV[256];\
1968
    uint8_t halfHV[256];\
1969
    copy_block17(full, src, 24, stride, 17);\
1970
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1974
}\
1975
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976
    uint8_t full[24*17];\
1977
    uint8_t halfH[272];\
1978
    copy_block17(full, src, 24, stride, 17);\
1979
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1982
}\
1983
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984
    uint8_t full[24*17];\
1985
    uint8_t halfH[272];\
1986
    uint8_t halfV[256];\
1987
    uint8_t halfHV[256];\
1988
    copy_block17(full, src, 24, stride, 17);\
1989
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1993
}\
1994
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995
    uint8_t full[24*17];\
1996
    uint8_t halfH[272];\
1997
    copy_block17(full, src, 24, stride, 17);\
1998
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2001
}\
2002
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003
    uint8_t halfH[272];\
2004
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2006
}

/* Per-pixel store operators plugged into QPEL_MC as OP():
 * the filter output b is divided by 32 with rounding (+16) or without
 * full rounding (+15 for the no_rnd forms), clipped to 0..255 through the
 * cm[] crop table, then either stored (put) or averaged into the existing
 * destination pixel (avg). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2012

    
2013
QPEL_MC(0, put_       , _       , op_put)
2014
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015
QPEL_MC(0, avg_       , _       , op_avg)
2016
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* helpers are only meaningful inside the QPEL_MC expansions
 * above; drop them so later code cannot use them by accident. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

/* H.264 six-tap sub-pel filters follow. */
#if 1
2023
#define H264_LOWPASS(OPNAME, OP, OP2) \
2024
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2025
    const int h=2;\
2026
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2027
    int i;\
2028
    for(i=0; i<h; i++)\
2029
    {\
2030
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2032
        dst+=dstStride;\
2033
        src+=srcStride;\
2034
    }\
2035
}\
2036
\
2037
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2038
    const int w=2;\
2039
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2040
    int i;\
2041
    for(i=0; i<w; i++)\
2042
    {\
2043
        const int srcB= src[-2*srcStride];\
2044
        const int srcA= src[-1*srcStride];\
2045
        const int src0= src[0 *srcStride];\
2046
        const int src1= src[1 *srcStride];\
2047
        const int src2= src[2 *srcStride];\
2048
        const int src3= src[3 *srcStride];\
2049
        const int src4= src[4 *srcStride];\
2050
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2051
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2052
        dst++;\
2053
        src++;\
2054
    }\
2055
}\
2056
\
2057
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2058
    const int h=2;\
2059
    const int w=2;\
2060
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2061
    int i;\
2062
    src -= 2*srcStride;\
2063
    for(i=0; i<h+5; i++)\
2064
    {\
2065
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2066
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2067
        tmp+=tmpStride;\
2068
        src+=srcStride;\
2069
    }\
2070
    tmp -= tmpStride*(h+5-2);\
2071
    for(i=0; i<w; i++)\
2072
    {\
2073
        const int tmpB= tmp[-2*tmpStride];\
2074
        const int tmpA= tmp[-1*tmpStride];\
2075
        const int tmp0= tmp[0 *tmpStride];\
2076
        const int tmp1= tmp[1 *tmpStride];\
2077
        const int tmp2= tmp[2 *tmpStride];\
2078
        const int tmp3= tmp[3 *tmpStride];\
2079
        const int tmp4= tmp[4 *tmpStride];\
2080
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2081
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2082
        dst++;\
2083
        tmp++;\
2084
    }\
2085
}\
2086
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2087
    const int h=4;\
2088
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2089
    int i;\
2090
    for(i=0; i<h; i++)\
2091
    {\
2092
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2093
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2094
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2095
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2096
        dst+=dstStride;\
2097
        src+=srcStride;\
2098
    }\
2099
}\
2100
\
2101
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2102
    const int w=4;\
2103
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2104
    int i;\
2105
    for(i=0; i<w; i++)\
2106
    {\
2107
        const int srcB= src[-2*srcStride];\
2108
        const int srcA= src[-1*srcStride];\
2109
        const int src0= src[0 *srcStride];\
2110
        const int src1= src[1 *srcStride];\
2111
        const int src2= src[2 *srcStride];\
2112
        const int src3= src[3 *srcStride];\
2113
        const int src4= src[4 *srcStride];\
2114
        const int src5= src[5 *srcStride];\
2115
        const int src6= src[6 *srcStride];\
2116
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2117
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2118
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2119
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2120
        dst++;\
2121
        src++;\
2122
    }\
2123
}\
2124
\
2125
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2126
    const int h=4;\
2127
    const int w=4;\
2128
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2129
    int i;\
2130
    src -= 2*srcStride;\
2131
    for(i=0; i<h+5; i++)\
2132
    {\
2133
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2136
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2137
        tmp+=tmpStride;\
2138
        src+=srcStride;\
2139
    }\
2140
    tmp -= tmpStride*(h+5-2);\
2141
    for(i=0; i<w; i++)\
2142
    {\
2143
        const int tmpB= tmp[-2*tmpStride];\
2144
        const int tmpA= tmp[-1*tmpStride];\
2145
        const int tmp0= tmp[0 *tmpStride];\
2146
        const int tmp1= tmp[1 *tmpStride];\
2147
        const int tmp2= tmp[2 *tmpStride];\
2148
        const int tmp3= tmp[3 *tmpStride];\
2149
        const int tmp4= tmp[4 *tmpStride];\
2150
        const int tmp5= tmp[5 *tmpStride];\
2151
        const int tmp6= tmp[6 *tmpStride];\
2152
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2153
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2154
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2155
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2156
        dst++;\
2157
        tmp++;\
2158
    }\
2159
}\
2160
\
2161
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2162
    const int h=8;\
2163
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2164
    int i;\
2165
    for(i=0; i<h; i++)\
2166
    {\
2167
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2168
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2169
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2170
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2171
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2172
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2173
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2174
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2175
        dst+=dstStride;\
2176
        src+=srcStride;\
2177
    }\
2178
}\
2179
\
2180
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2181
    const int w=8;\
2182
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2183
    int i;\
2184
    for(i=0; i<w; i++)\
2185
    {\
2186
        const int srcB= src[-2*srcStride];\
2187
        const int srcA= src[-1*srcStride];\
2188
        const int src0= src[0 *srcStride];\
2189
        const int src1= src[1 *srcStride];\
2190
        const int src2= src[2 *srcStride];\
2191
        const int src3= src[3 *srcStride];\
2192
        const int src4= src[4 *srcStride];\
2193
        const int src5= src[5 *srcStride];\
2194
        const int src6= src[6 *srcStride];\
2195
        const int src7= src[7 *srcStride];\
2196
        const int src8= src[8 *srcStride];\
2197
        const int src9= src[9 *srcStride];\
2198
        const int src10=src[10*srcStride];\
2199
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2200
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2201
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2202
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2204
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2205
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2206
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2207
        dst++;\
2208
        src++;\
2209
    }\
2210
}\
2211
\
2212
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2213
    const int h=8;\
2214
    const int w=8;\
2215
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2216
    int i;\
2217
    src -= 2*srcStride;\
2218
    for(i=0; i<h+5; i++)\
2219
    {\
2220
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2221
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2222
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2223
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2224
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2225
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2226
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2227
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2228
        tmp+=tmpStride;\
2229
        src+=srcStride;\
2230
    }\
2231
    tmp -= tmpStride*(h+5-2);\
2232
    for(i=0; i<w; i++)\
2233
    {\
2234
        const int tmpB= tmp[-2*tmpStride];\
2235
        const int tmpA= tmp[-1*tmpStride];\
2236
        const int tmp0= tmp[0 *tmpStride];\
2237
        const int tmp1= tmp[1 *tmpStride];\
2238
        const int tmp2= tmp[2 *tmpStride];\
2239
        const int tmp3= tmp[3 *tmpStride];\
2240
        const int tmp4= tmp[4 *tmpStride];\
2241
        const int tmp5= tmp[5 *tmpStride];\
2242
        const int tmp6= tmp[6 *tmpStride];\
2243
        const int tmp7= tmp[7 *tmpStride];\
2244
        const int tmp8= tmp[8 *tmpStride];\
2245
        const int tmp9= tmp[9 *tmpStride];\
2246
        const int tmp10=tmp[10*tmpStride];\
2247
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2248
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2249
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2250
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2251
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2252
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2253
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2254
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2255
        dst++;\
2256
        tmp++;\
2257
    }\
2258
}\
2259
\
2260
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2262
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2263
    src += 8*srcStride;\
2264
    dst += 8*dstStride;\
2265
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2266
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2267
}\
2268
\
2269
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2271
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2272
    src += 8*srcStride;\
2273
    dst += 8*dstStride;\
2274
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2275
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2276
}\
2277
\
2278
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2279
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2280
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2281
    src += 8*srcStride;\
2282
    dst += 8*dstStride;\
2283
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2284
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2285
}\
2286

    
2287
/**
 * H264_MC(OPNAME, SIZE) expands to the 16 H.264 quarter-pel motion
 * compensation functions (_mc00 .. _mc33) for one block size, built from
 * the 6-tap h/v/hv lowpass primitives and the pixels copy/average helpers.
 * Fix: removed revision-browser line numbers that were interleaved with the
 * macro continuation lines and broke compilation.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2425
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2426
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2427
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2428
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2429

    
2430
H264_LOWPASS(put_       , op_put, op2_put)
2431
H264_LOWPASS(avg_       , op_avg, op2_avg)
2432
H264_MC(put_, 2)
2433
H264_MC(put_, 4)
2434
H264_MC(put_, 8)
2435
H264_MC(put_, 16)
2436
H264_MC(avg_, 4)
2437
H264_MC(avg_, 8)
2438
H264_MC(avg_, 16)
2439

    
2440
/* Scope the helper macros to the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
/* NOTE(review): the matching #if for this #endif is above this chunk — verify. */
#endif

/**
 * H264_WEIGHT(W,H) expands to the explicit (weight_*) and bidirectional
 * (biweight_*) weighted-prediction functions for a WxH block:
 * pixel*weight with rounding offset, shifted by log2_denom, clipped to 8 bit.
 * Fix: removed revision-browser line numbers interleaved with the macro.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* Instantiate weighted prediction for all H.264 partition sizes.
 * Fix: removed interleaved revision-browser line numbers. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

/* Scope the weighted-prediction helper macros to the instantiations above. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2517
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2518
    int i;
2519

    
2520
    for(i=0; i<h; i++){
2521
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2522
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2523
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2524
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2525
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2526
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2527
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2528
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2529
        dst+=dstStride;
2530
        src+=srcStride;
2531
    }
2532
}
2533

    
2534
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) CAVS qpel entry points: forward to the plain
 * pixel copy/average primitives defined earlier in this file.
 * Fix: removed interleaved revision-browser line numbers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel VC-1 mspel entry point: plain 8x8 copy (rnd is unused here).
 * Fix: removed interleaved revision-browser line numbers. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */

#if defined(CONFIG_H264_ENCODER)
/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_H264_ENCODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2567
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2568
    int i;
2569

    
2570
    for(i=0; i<w; i++){
2571
        const int src_1= src[ -srcStride];
2572
        const int src0 = src[0          ];
2573
        const int src1 = src[  srcStride];
2574
        const int src2 = src[2*srcStride];
2575
        const int src3 = src[3*srcStride];
2576
        const int src4 = src[4*srcStride];
2577
        const int src5 = src[5*srcStride];
2578
        const int src6 = src[6*srcStride];
2579
        const int src7 = src[7*srcStride];
2580
        const int src8 = src[8*srcStride];
2581
        const int src9 = src[9*srcStride];
2582
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2583
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2584
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2585
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2586
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2587
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2588
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2589
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2590
        src++;
2591
        dst++;
2592
    }
2593
}
2594

    
2595
/* WMV2 mspel full-pel position: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* WMV2 mspel quarter-pel (left): average of source and horizontal half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* WMV2 mspel horizontal half-pel: lowpass straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* WMV2 mspel quarter-pel (right): average of src+1 and horizontal half-pel. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* WMV2 mspel vertical half-pel: lowpass straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* WMV2 mspel (1,2): average of vertical half-pel and the h+v filtered
 * block; halfH holds 11 filtered rows starting one row above src. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* WMV2 mspel (3,2): like mc12 but the vertical half-pel is taken at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

/* WMV2 mspel centre half-pel: horizontal then vertical lowpass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2644
    if(ENABLE_ANY_H263) {
2645
    int x;
2646
    const int strength= ff_h263_loop_filter_strength[qscale];
2647

    
2648
    for(x=0; x<8; x++){
2649
        int d1, d2, ad1;
2650
        int p0= src[x-2*stride];
2651
        int p1= src[x-1*stride];
2652
        int p2= src[x+0*stride];
2653
        int p3= src[x+1*stride];
2654
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2655

    
2656
        if     (d<-2*strength) d1= 0;
2657
        else if(d<-  strength) d1=-2*strength - d;
2658
        else if(d<   strength) d1= d;
2659
        else if(d< 2*strength) d1= 2*strength - d;
2660
        else                   d1= 0;
2661

    
2662
        p1 += d1;
2663
        p2 -= d1;
2664
        if(p1&256) p1= ~(p1>>31);
2665
        if(p2&256) p2= ~(p2>>31);
2666

    
2667
        src[x-1*stride] = p1;
2668
        src[x+0*stride] = p2;
2669

    
2670
        ad1= FFABS(d1)>>1;
2671

    
2672
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2673

    
2674
        src[x-2*stride] = p0 - d2;
2675
        src[x+  stride] = p3 + d2;
2676
    }
2677
    }
2678
}
2679

    
2680
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2681
    if(ENABLE_ANY_H263) {
2682
    int y;
2683
    const int strength= ff_h263_loop_filter_strength[qscale];
2684

    
2685
    for(y=0; y<8; y++){
2686
        int d1, d2, ad1;
2687
        int p0= src[y*stride-2];
2688
        int p1= src[y*stride-1];
2689
        int p2= src[y*stride+0];
2690
        int p3= src[y*stride+1];
2691
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2692

    
2693
        if     (d<-2*strength) d1= 0;
2694
        else if(d<-  strength) d1=-2*strength - d;
2695
        else if(d<   strength) d1= d;
2696
        else if(d< 2*strength) d1= 2*strength - d;
2697
        else                   d1= 0;
2698

    
2699
        p1 += d1;
2700
        p2 -= d1;
2701
        if(p1&256) p1= ~(p1>>31);
2702
        if(p2&256) p2= ~(p2>>31);
2703

    
2704
        src[y*stride-1] = p1;
2705
        src[y*stride+0] = p2;
2706

    
2707
        ad1= FFABS(d1)>>1;
2708

    
2709
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2710

    
2711
        src[y*stride-2] = p0 - d2;
2712
        src[y*stride+1] = p3 + d2;
2713
    }
2714
    }
2715
}
2716

    
2717
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * First pass filters vertically into temp (top/bottom rows are passed
 * through, scaled by 4); second pass filters horizontally back into src,
 * passing the leftmost/rightmost columns through with rounding.
 * Fix: removed interleaved revision-browser line numbers.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

/**
 * H.264 normal (bS<4) luma deblocking over a 16-sample edge, processed as
 * four groups of 4; tc0[i] < 0 skips group i. p0/q0 are corrected by a
 * clipped delta; p1/q1 additionally when the p2/q2 threshold passes
 * (which also widens the clip range tc by one per side).
 * Fix: removed interleaved revision-browser line numbers.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}

/* Vertical-direction luma deblock: samples across the edge are 'stride' apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Horizontal-direction luma deblock: samples across the edge are adjacent. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

/**
 * H.264 normal (bS<4) chroma deblocking over an 8-sample edge, processed
 * as four groups of 2; tc0[i] <= 0 skips group i. Only p0/q0 are
 * corrected, by a delta clipped to [-tc, tc].
 * Fix: removed interleaved revision-browser line numbers.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}

/* Vertical-direction chroma deblock wrapper. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}

/* Horizontal-direction chroma deblock wrapper. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

/**
 * H.264 strong (intra, bS=4) chroma deblocking over an 8-sample edge:
 * p0/q0 are replaced by fixed (2*p1 + p0 + q1 + 2)>>2 averages when the
 * alpha/beta edge conditions hold; no tc clipping in this mode.
 * Fix: removed interleaved revision-browser line numbers.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}

/* Vertical-direction intra chroma deblock wrapper. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}

/* Horizontal-direction intra chroma deblock wrapper. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

/**
 * Sum of absolute differences (SAD) of a 16-wide block over h rows.
 * First parameter is an unused context pointer (DSPContext callback ABI).
 * Fix: removed interleaved revision-browser line numbers.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation
 * of pix2 (rounding average of each sample with its right neighbour;
 * reads pix2[0..16] per row). First parameter is an unused context pointer.
 * Fix: removed interleaved revision-browser line numbers.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (rounding average of each sample with the one a line below;
 * reads h+1 rows of pix2). First parameter is an unused context pointer.
 * Fix: removed interleaved revision-browser line numbers.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD of a 16-wide block against the half-pel diagonally interpolated
 * reference (2x2 neighbourhood average). Reads one pixel past column 15.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2973

    
2974
/**
 * Plain SAD of an 8-wide block: sum of |pix1 - pix2| over 8 columns and
 * h rows, both buffers advanced by line_size per row.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2993

    
2994
/**
 * SAD of an 8-wide block against the half-pel horizontally interpolated
 * reference. Reads one pixel past column 7 of pix2.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3013

    
3014
/**
 * SAD of an 8-wide block against the half-pel vertically interpolated
 * reference (average with the row below).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3035

    
3036
/**
 * SAD of an 8-wide block against the half-pel diagonally interpolated
 * reference (2x2 neighbourhood average). Reads one pixel past column 7.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3057

    
3058
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3059
    MpegEncContext *c = v;
3060
    int score1=0;
3061
    int score2=0;
3062
    int x,y;
3063

    
3064
    for(y=0; y<h; y++){
3065
        for(x=0; x<16; x++){
3066
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3067
        }
3068
        if(y+1<h){
3069
            for(x=0; x<15; x++){
3070
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3071
                             - s1[x+1] + s1[x+1+stride])
3072
                        -FFABS(  s2[x  ] - s2[x  +stride]
3073
                             - s2[x+1] + s2[x+1+stride]);
3074
            }
3075
        }
3076
        s1+= stride;
3077
        s2+= stride;
3078
    }
3079

    
3080
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3081
    else  return score1 + FFABS(score2)*8;
3082
}
3083

    
3084
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3085
    MpegEncContext *c = v;
3086
    int score1=0;
3087
    int score2=0;
3088
    int x,y;
3089

    
3090
    for(y=0; y<h; y++){
3091
        for(x=0; x<8; x++){
3092
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3093
        }
3094
        if(y+1<h){
3095
            for(x=0; x<7; x++){
3096
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3097
                             - s1[x+1] + s1[x+1+stride])
3098
                        -FFABS(  s2[x  ] - s2[x  +stride]
3099
                             - s2[x+1] + s2[x+1+stride]);
3100
            }
3101
        }
3102
        s1+= stride;
3103
        s2+= stride;
3104
    }
3105

    
3106
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3107
    else  return score1 + FFABS(score2)*8;
3108
}
3109

    
3110
/**
 * Weighted squared error of the residual after adding a scaled basis
 * function, without modifying rem[]. Fixed-point: basis is at BASIS_SHIFT
 * precision, rem at RECON_SHIFT; the result is scaled down by >>4 per
 * term and >>2 overall.
 * NOTE(review): presumably used by the encoder's iterative basis/trellis
 * refinement (see add_8x8basis_c) — confirm at call sites.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* rounded, scaled basis contribution added to the residual */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT; /* back to integer pixel-domain precision */
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4; /* weighted squared term, pre-scaled to avoid overflow */
    }
    return sum>>2;
}
3124

    
3125
/**
 * Add a scaled basis function to the residual block in place, with
 * rounding; the inverse-precision shift matches try_8x8basis_c so that a
 * candidate evaluated there can be committed here.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3132

    
3133
/**
3134
 * permutes an 8x8 block.
3135
 * @param block the block which will be permuted according to the given permutation vector
3136
 * @param permutation the permutation vector
3137
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3138
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3139
 *                  (inverse) permutated to scantable order!
3140
 */
3141
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3142
{
3143
    int i;
3144
    DCTELEM temp[64];
3145

    
3146
    if(last<=0) return;
3147
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3148

    
3149
    for(i=0; i<=last; i++){
3150
        const int j= scantable[i];
3151
        temp[j]= block[j];
3152
        block[j]=0;
3153
    }
3154

    
3155
    for(i=0; i<=last; i++){
3156
        const int j= scantable[i];
3157
        const int perm_j= permutation[j];
3158
        block[perm_j]= temp[j];
3159
    }
3160
}
3161

    
3162
/* Dummy comparison function: every candidate scores 0 (FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3165

    
3166
/**
 * Fill the 5-entry comparison-function table cmp[] from the DSPContext
 * according to the FF_CMP_* id in the low byte of type. Unknown ids leave
 * the entries zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* NOTE(review): assumes function pointers have the size of void*;
       sizeof(*cmp)*5 would be strictly portable */
    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        /* wavelet metrics are only built with the Snow encoder */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3225

    
3226
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zero all six 64-coefficient blocks of a macroblock
 * (presumably 4 luma + 2 chroma — the standard 4:2:0 layout).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3233

    
3234
/**
 * Elementwise byte addition: dst[i] += src[i] for i in [0, w), with the
 * usual uint8_t wrap-around on overflow.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = dst[i] + src[i];
}
3249

    
3250
/**
 * Elementwise byte subtraction: dst[i] = src1[i] - src2[i] for i in
 * [0, w), wrapping modulo 256.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3265

    
3266
/**
 * HuffYUV median prediction residual: for each column, predict the
 * current-row sample src2[i] as the median of left, top (src1[i]) and
 * left+top-topleft, and store the difference in dst.
 * @param src1 previous (top) row
 * @param src2 current row being encoded
 * @param left [in/out] left neighbour carried across calls
 * @param left_top [in/out] top-left neighbour carried across calls
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient predictor clipped by mid_pred; &0xFF keeps byte wrap */
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];   /* top becomes next column's top-left */
        l= src2[i];    /* current sample becomes next column's left */
        dst[i]= l - pred;
    }

    /* persist the running neighbours for the caller's next slice */
    *left= l;
    *left_top= lt;
}
3283

    
3284
#define BUTTERFLY2(o1,o2,i1,i2) \
3285
o1= (i1)+(i2);\
3286
o2= (i1)-(i2);
3287

    
3288
#define BUTTERFLY1(x,y) \
3289
{\
3290
    int a,b;\
3291
    a= x;\
3292
    b= y;\
3293
    x= a+b;\
3294
    y= a-b;\
3295
}
3296

    
3297
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3298

    
3299
/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference block (src - dst). Rows are transformed first, then columns;
 * the last column butterfly stage is folded into the BUTTERFLYA
 * absolute-sum.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard transform of each residual row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: transform each column; final stage summed directly */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3350

    
3351
/**
 * Intra SATD: sum of absolute 8x8 Hadamard transform coefficients of the
 * source block itself, with the DC term removed at the end so the measure
 * is invariant to overall block brightness.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard transform of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: transform each column; final stage summed directly */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] is the DC coefficient at this point; drop it */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3398

    
3399
/**
 * DCT-SAD: forward-DCT the 8x8 pixel difference (src1 - src2) and return
 * the sum of absolute transform coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 16-byte aligned scratch block, required by SIMD fdct implementations */
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3410

    
3411
#ifdef CONFIG_GPL
3412
#define DCT8_1D {\
3413
    const int s07 = SRC(0) + SRC(7);\
3414
    const int s16 = SRC(1) + SRC(6);\
3415
    const int s25 = SRC(2) + SRC(5);\
3416
    const int s34 = SRC(3) + SRC(4);\
3417
    const int a0 = s07 + s34;\
3418
    const int a1 = s16 + s25;\
3419
    const int a2 = s07 - s34;\
3420
    const int a3 = s16 - s25;\
3421
    const int d07 = SRC(0) - SRC(7);\
3422
    const int d16 = SRC(1) - SRC(6);\
3423
    const int d25 = SRC(2) - SRC(5);\
3424
    const int d34 = SRC(3) - SRC(4);\
3425
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3426
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3427
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3428
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3429
    DST(0,  a0 + a1     ) ;\
3430
    DST(1,  a4 + (a7>>2)) ;\
3431
    DST(2,  a2 + (a3>>1)) ;\
3432
    DST(3,  a5 + (a6>>2)) ;\
3433
    DST(4,  a0 - a1     ) ;\
3434
    DST(5,  a6 - (a5>>2)) ;\
3435
    DST(6, (a2>>1) - a3 ) ;\
3436
    DST(7, (a4>>2) - a7 ) ;\
3437
}
3438

    
3439
/**
 * H.264-style 8x8 integer-DCT SAD: transform the pixel difference with
 * DCT8_1D over rows, then over columns; in the column pass the DST macro
 * accumulates absolute coefficient values instead of storing them.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row transform, in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column transform; DST folds |v| straight into the sum */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
3462
#endif
3463

    
3464
/**
 * DCT-MAX metric: forward-DCT the 8x8 pixel difference and return the
 * largest absolute transform coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte aligned scratch block for the fdct */
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3480

    
3481
/**
 * Quantization-error metric: run the 8x8 residual through the encoder's
 * quantize -> dequantize -> IDCT round trip and return the squared error
 * against the untouched residual copy.
 * NOTE(review): the comparison is temp(after idct) vs bak(pre-quantize
 * residual) — presumably fast_dct_quantize performs the forward DCT
 * internally so both end up in the same domain; confirm against
 * fast_dct_quantize's contract.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned buffer holding two 64-coefficient blocks */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* force inter quantizer path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM)); /* keep the unquantized reference */

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3504

    
3505
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3506
    MpegEncContext * const s= (MpegEncContext *)c;
3507
    const uint8_t *scantable= s->intra_scantable.permutated;
3508
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3509
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3510
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511
    uint8_t * const bak= (uint8_t*)aligned_bak;
3512
    int i, last, run, bits, level, distoration, start_i;
3513
    const int esc_length= s->ac_esc_length;
3514
    uint8_t * length;
3515
    uint8_t * last_length;
3516

    
3517
    assert(h==8);
3518

    
3519
    for(i=0; i<8; i++){
3520
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3521
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3522
    }
3523

    
3524
    s->dsp.diff_pixels(temp, src1, src2, stride);
3525

    
3526
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3527

    
3528
    bits=0;
3529

    
3530
    if (s->mb_intra) {
3531
        start_i = 1;
3532
        length     = s->intra_ac_vlc_length;
3533
        last_length= s->intra_ac_vlc_last_length;
3534
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3535
    } else {
3536
        start_i = 0;
3537
        length     = s->inter_ac_vlc_length;
3538
        last_length= s->inter_ac_vlc_last_length;
3539
    }
3540

    
3541
    if(last>=start_i){
3542
        run=0;
3543
        for(i=start_i; i<last; i++){
3544
            int j= scantable[i];
3545
            level= temp[j];
3546

    
3547
            if(level){
3548
                level+=64;
3549
                if((level&(~127)) == 0){
3550
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3551
                }else
3552
                    bits+= esc_length;
3553
                run=0;
3554
            }else
3555
                run++;
3556
        }
3557
        i= scantable[last];
3558

    
3559
        level= temp[i] + 64;
3560

    
3561
        assert(level - 64);
3562

    
3563
        if((level&(~127)) == 0){
3564
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3565
        }else
3566
            bits+= esc_length;
3567

    
3568
    }
3569

    
3570
    if(last>=0){
3571
        if(s->mb_intra)
3572
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3573
        else
3574
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3575
    }
3576

    
3577
    s->dsp.idct_add(bak, stride, temp);
3578

    
3579
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3580

    
3581
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3582
}
3583

    
3584
/**
 * Bit-cost metric for an 8x8 block: quantize the residual and estimate the
 * number of bits needed to code it with the encoder's VLC length tables
 * (same run/level walk as rd8x8_c, without the reconstruction/distortion).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick VLC length tables for the current coding mode */
    if (s->mb_intra) {
        start_i = 1; /* DC is coded separately for intra */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run/level bit-cost estimation over the scanned coefficients */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias into the table's index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* outside table: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the 'last' coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3643

    
3644
/**
 * Intra vertical SAD: sum of absolute vertical first differences over a
 * 16-wide block (h-1 row pairs). Measures vertical activity of the block
 * itself; c and dummy are unused.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += abs(s[x] - s[x + stride]);
        s += stride;
    }

    return total;
}
3658

    
3659
/**
 * Vertical SAD of the difference block: sum of absolute vertical first
 * differences of (s1 - s2) over a 16-wide block (h-1 row pairs).
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += abs(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3673

    
3674
#define SQ(a) ((a)*(a))
3675
/**
 * Intra vertical SSE: sum of squared vertical first differences over a
 * 16-wide block (h-1 row pairs). c and dummy are unused.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s[x] - s[x + stride];
            total += d * d;
        }
        s += stride;
    }

    return total;
}
3689

    
3690
/**
 * Vertical SSE of the difference block: sum of squared vertical first
 * differences of (s1 - s2) over a 16-wide block (h-1 row pairs).
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3704

    
3705
/**
 * Sum of squared differences between an int8 and an int16 array of the
 * given length.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        total += d * d;
    }
    return total;
}
3713

    
3714
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3715
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3716
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3717
#ifdef CONFIG_GPL
3718
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3719
#endif
3720
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3721
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3722
WARPER8_16_SQ(rd8x8_c, rd16_c)
3723
WARPER8_16_SQ(bit8x8_c, bit16_c)
3724

    
3725
/* In-place elementwise float multiply: dst[i] *= src[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int i = 0;
    while (i < len) {
        dst[i] = dst[i] * src[i];
        i++;
    }
}
3730

    
3731
/* dst[i] = src0[i] * src1 read back-to-front: src1[len-1-i]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
3737

    
3738
/*
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3 (left-to-right FP
 * evaluation order preserved: the product+src2 sum is formed first).
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for (i = 0; i < len; i++) {
        const float acc = src0[i] * src1[i] + src2[i];
        dst[i*step] = acc + src3;
    }
}
3743

    
3744
/**
 * Fast float -> signed 16-bit conversion via an IEEE-754 bit trick.
 * NOTE(review): the input floats are read as raw int32 bit patterns
 * (a strict-aliasing violation — presumably tolerated by FFmpeg's build
 * flags; confirm). The samples appear to be expected pre-scaled/biased so
 * that in-range values have the target integer + 0x8000 in the low 16
 * bits of the bit pattern; out-of-range values are detected via the
 * exponent bits (tmp & 0xf0000) and clamped to 0 or 0xFFFF before the
 * 0x8000 bias is removed. Confirm the exact expected input scaling at
 * call sites.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((int32_t*)src)[i]; /* reinterpret float bits */
        if(tmp & 0xf0000){
            /* out of range: branchless clamp to 0x0000 or 0xFFFF */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000; /* remove the bias, keep low 16 bits */
    }
}
3757

    
3758
/* XXX: those functions should be suppressed ASAP when all IDCTs are
3759
 converted */
3760
/* reference 8x8 integer IDCT, result stored (clamped to [0,255]) to dest */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* reference 8x8 integer IDCT, result added (clamped) onto dest */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3770

    
3771
/* 4x4 reduced IDCT (lowres==1 path), result stored clamped to dest */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced IDCT (lowres==1 path), result added clamped onto dest */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3781

    
3782
/* 2x2 reduced IDCT (lowres==2 path), result stored clamped to dest */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced IDCT (lowres==2 path), result added clamped onto dest */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3792

    
3793
/* 1x1 "IDCT" (lowres==3 path): only the DC term survives; (block[0]+4)>>3
   rounds it back to pixel scale, clipped through ff_cropTbl */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 "IDCT": rounded DC term added onto dest, clipped through ff_cropTbl */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3805

    
3806
/* Intentional no-op. NOTE(review): appears to be a stub for disabling or
   benchmarking a dsp hook — confirm where it is assigned. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3807

    
3808
/* init static data */
/**
 * Initialize the process-wide lookup tables: the pixel clipping table,
 * the signed-square table and the inverse zigzag scan. Must run before
 * any code that uses these tables.
 */
void dsputil_static_init(void)
{
    int i;

    /* clipping table: identity on [0,255], clamped to 0/255 outside,
       with MAX_NEG_CROP guard entries on each side */
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squares of signed values: ff_squareTbl[x + 256] == x*x */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan, stored 1-based (0 stays "unset") */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
3825

    
3826
/**
 * Verify that the compiler honours 16-byte stack alignment, which the
 * SIMD code paths rely on. Returns 0 on success, -1 if a stack variable
 * declared 16-byte aligned is in fact misaligned; the warning is printed
 * at most once per process.
 */
int ff_check_alignment(void){
    static int did_fail=0;              /* warn only on the first failure */
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){            /* low 4 address bits must be zero */
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
3844

    
3845
/**
 * Initialize a DSPContext.
 * Fills every function pointer with the portable C implementation, then
 * calls the per-architecture init functions (MMX, AltiVec, ...) which may
 * override entries with optimized versions.  avctx selects the (I)DCT
 * algorithms and the lowres downscaling factor.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

    /* forward DCT selection (encoders only), per avctx->dct_algo */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection: lowres>0 uses reduced-size (4x4/2x2/1x1)
     * IDCTs, otherwise avctx->idct_algo picks the full-size variant */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= simple_idct_put;
            c->idct_add= simple_idct_add;
            c->idct    = simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    /* basic pixel-block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* SAD functions: index [0] = 16x16, [1] = 8x8; second index selects
     * the half-pel interpolation (none, x2, y2, xy2) */
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables; IDX 0..3 are the 16/8/4/2 pixel widths */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (indices 3, 7, 11 are unused) */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel tables: all 16 sub-pixel positions mcXY */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    /* H.264 weighted/bi-weighted prediction, one entry per block size */
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    /* codec-specific DSP sub-initializers */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: [0] 16x16 variant, [1] 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    /* H.264 in-loop deblocking filters; the strength function has no C
     * fallback and stays NULL unless an arch init provides one */
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    /* image shrinkers: [n] halves each dimension n times (0 = plain copy) */
    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* zero these so the fallback fill-in loop below can detect which
     * entries the arch inits left unset */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific overrides; must run after all C defaults */
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_SPARC)    dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    /* fall back to the h264 qpel functions for 2tap entries no arch set */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation table matching the IDCT chosen
     * above (possibly replaced by an arch init) */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4191