Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ a2fc0f6a

History | View | Annotate | Download (162 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "h263.h"
36
#include "snow.h"
37

    
38
/* snow.c */
39
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40

    
41
/* vorbis.c */
42
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43

    
44
/* ac3dec.c */
45
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
46

    
47
/* flacenc.c */
48
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
49

    
50
/* pngdec.c */
51
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
52

    
53
/* eaidct.c */
54
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
55

    
56
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
57
uint32_t ff_squareTbl[512] = {0, };
58

    
59
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
60
#define pb_7f (~0UL/255 * 0x7f)
61
#define pb_80 (~0UL/255 * 0x80)
62

    
63
const uint8_t ff_zigzag_direct[64] = {
64
    0,   1,  8, 16,  9,  2,  3, 10,
65
    17, 24, 32, 25, 18, 11,  4,  5,
66
    12, 19, 26, 33, 40, 48, 41, 34,
67
    27, 20, 13,  6,  7, 14, 21, 28,
68
    35, 42, 49, 56, 57, 50, 43, 36,
69
    29, 22, 15, 23, 30, 37, 44, 51,
70
    58, 59, 52, 45, 38, 31, 39, 46,
71
    53, 60, 61, 54, 47, 55, 62, 63
72
};
73

    
74
/* Specific zigzag scan for 248 idct. NOTE that unlike the
75
   specification, we interleave the fields */
76
const uint8_t ff_zigzag248_direct[64] = {
77
     0,  8,  1,  9, 16, 24,  2, 10,
78
    17, 25, 32, 40, 48, 56, 33, 41,
79
    18, 26,  3, 11,  4, 12, 19, 27,
80
    34, 42, 49, 57, 50, 58, 35, 43,
81
    20, 28,  5, 13,  6, 14, 21, 29,
82
    36, 44, 51, 59, 52, 60, 37, 45,
83
    22, 30,  7, 15, 23, 31, 38, 46,
84
    53, 61, 54, 62, 39, 47, 55, 63,
85
};
86

    
87
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
88
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
89

    
90
const uint8_t ff_alternate_horizontal_scan[64] = {
91
    0,  1,   2,  3,  8,  9, 16, 17,
92
    10, 11,  4,  5,  6,  7, 15, 14,
93
    13, 12, 19, 18, 24, 25, 32, 33,
94
    26, 27, 20, 21, 22, 23, 28, 29,
95
    30, 31, 34, 35, 40, 41, 48, 49,
96
    42, 43, 36, 37, 38, 39, 44, 45,
97
    46, 47, 50, 51, 56, 57, 58, 59,
98
    52, 53, 54, 55, 60, 61, 62, 63,
99
};
100

    
101
const uint8_t ff_alternate_vertical_scan[64] = {
102
    0,  8,  16, 24,  1,  9,  2, 10,
103
    17, 25, 32, 40, 48, 56, 57, 49,
104
    41, 33, 26, 18,  3, 11,  4, 12,
105
    19, 27, 34, 42, 50, 58, 35, 43,
106
    51, 59, 20, 28,  5, 13,  6, 14,
107
    21, 29, 36, 44, 52, 60, 37, 45,
108
    53, 61, 22, 30,  7, 15, 23, 31,
109
    38, 46, 54, 62, 39, 47, 55, 63,
110
};
111

    
112
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
113
const uint32_t ff_inverse[256]={
114
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
115
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
116
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
117
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
118
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
119
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
120
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
121
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
122
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
123
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
124
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
125
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
126
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
127
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
128
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
129
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
130
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
131
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
132
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
133
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
134
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
135
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
136
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
137
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
138
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
139
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
140
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
141
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
142
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
143
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
144
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
145
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
146
};
147

    
148
/* Input permutation for the simple_idct_mmx */
149
static const uint8_t simple_mmx_permutation[64]={
150
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
151
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
152
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
153
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
154
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
155
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
156
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
157
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
158
};
159

    
160
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
161

    
162
/**
 * Initializes a ScanTable from a scan order and a coefficient permutation.
 *
 * For each of the 64 scan positions i:
 *  - st->permutated[i] is permutation[src_scantable[i]];
 *  - st->raster_end[i] is the largest permutated index seen in positions 0..i;
 *  - on ARCH_POWERPC only, st->inverse[src_scantable[i]] is i.
 *
 * @param permutation   64-entry table indexed by the values of src_scantable
 * @param st            table to fill; st->scantable is set to src_scantable
 *                      (the pointer is stored, the data is not copied)
 * @param src_scantable 64-entry scan order
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        const int j = src_scantable[i];

        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    /* running maximum of the permutated indices */
    end=-1;
    for(i=0; i<64; i++){
        const int j = st->permutated[i];

        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
185

    
186
/**
 * Sums the samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between vertically adjacent samples
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
207

    
208
static int pix_norm1_c(uint8_t * pix, int line_size)
209
{
210
    int s, i, j;
211
    uint32_t *sq = ff_squareTbl + 256;
212

    
213
    s = 0;
214
    for (i = 0; i < 16; i++) {
215
        for (j = 0; j < 16; j += 8) {
216
#if 0
217
            s += sq[pix[0]];
218
            s += sq[pix[1]];
219
            s += sq[pix[2]];
220
            s += sq[pix[3]];
221
            s += sq[pix[4]];
222
            s += sq[pix[5]];
223
            s += sq[pix[6]];
224
            s += sq[pix[7]];
225
#else
226
#if LONG_MAX > 2147483647
227
            register uint64_t x=*(uint64_t*)pix;
228
            s += sq[x&0xff];
229
            s += sq[(x>>8)&0xff];
230
            s += sq[(x>>16)&0xff];
231
            s += sq[(x>>24)&0xff];
232
            s += sq[(x>>32)&0xff];
233
            s += sq[(x>>40)&0xff];
234
            s += sq[(x>>48)&0xff];
235
            s += sq[(x>>56)&0xff];
236
#else
237
            register uint32_t x=*(uint32_t*)pix;
238
            s += sq[x&0xff];
239
            s += sq[(x>>8)&0xff];
240
            s += sq[(x>>16)&0xff];
241
            s += sq[(x>>24)&0xff];
242
            x=*(uint32_t*)(pix+4);
243
            s += sq[x&0xff];
244
            s += sq[(x>>8)&0xff];
245
            s += sq[(x>>16)&0xff];
246
            s += sq[(x>>24)&0xff];
247
#endif
248
#endif
249
            pix += 8;
250
        }
251
        pix += line_size - 16;
252
    }
253
    return s;
254
}
255

    
256
/**
 * Applies bswap_32() to each of the first w 32-bit words of src and
 * stores the results in dst, in ascending index order.
 *
 * @param dst destination words
 * @param src source words
 * @param w   number of words to process; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i]= bswap_32(src[i]);
}
273

    
274
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
275
{
276
    int s, i;
277
    uint32_t *sq = ff_squareTbl + 256;
278

    
279
    s = 0;
280
    for (i = 0; i < h; i++) {
281
        s += sq[pix1[0] - pix2[0]];
282
        s += sq[pix1[1] - pix2[1]];
283
        s += sq[pix1[2] - pix2[2]];
284
        s += sq[pix1[3] - pix2[3]];
285
        pix1 += line_size;
286
        pix2 += line_size;
287
    }
288
    return s;
289
}
290

    
291
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
292
{
293
    int s, i;
294
    uint32_t *sq = ff_squareTbl + 256;
295

    
296
    s = 0;
297
    for (i = 0; i < h; i++) {
298
        s += sq[pix1[0] - pix2[0]];
299
        s += sq[pix1[1] - pix2[1]];
300
        s += sq[pix1[2] - pix2[2]];
301
        s += sq[pix1[3] - pix2[3]];
302
        s += sq[pix1[4] - pix2[4]];
303
        s += sq[pix1[5] - pix2[5]];
304
        s += sq[pix1[6] - pix2[6]];
305
        s += sq[pix1[7] - pix2[7]];
306
        pix1 += line_size;
307
        pix2 += line_size;
308
    }
309
    return s;
310
}
311

    
312
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
313
{
314
    int s, i;
315
    uint32_t *sq = ff_squareTbl + 256;
316

    
317
    s = 0;
318
    for (i = 0; i < h; i++) {
319
        s += sq[pix1[ 0] - pix2[ 0]];
320
        s += sq[pix1[ 1] - pix2[ 1]];
321
        s += sq[pix1[ 2] - pix2[ 2]];
322
        s += sq[pix1[ 3] - pix2[ 3]];
323
        s += sq[pix1[ 4] - pix2[ 4]];
324
        s += sq[pix1[ 5] - pix2[ 5]];
325
        s += sq[pix1[ 6] - pix2[ 6]];
326
        s += sq[pix1[ 7] - pix2[ 7]];
327
        s += sq[pix1[ 8] - pix2[ 8]];
328
        s += sq[pix1[ 9] - pix2[ 9]];
329
        s += sq[pix1[10] - pix2[10]];
330
        s += sq[pix1[11] - pix2[11]];
331
        s += sq[pix1[12] - pix2[12]];
332
        s += sq[pix1[13] - pix2[13]];
333
        s += sq[pix1[14] - pix2[14]];
334
        s += sq[pix1[15] - pix2[15]];
335

    
336
        pix1 += line_size;
337
        pix2 += line_size;
338
    }
339
    return s;
340
}
341

    
342

    
343
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
344
/**
 * Wavelet-domain comparison of two square blocks.
 *
 * The per-sample difference pix1 - pix2, scaled by 16, is written into a
 * 32-wide scratch buffer and transformed with ff_spatial_dwt(). The absolute
 * values of the resulting coefficients, each multiplied by the weight of its
 * decomposition level and orientation, are summed.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param w         block width: 8 uses 3 decomposition levels, any other
 *                  value uses 4 (the callers in this file pass 8, 16 or 32)
 * @param h         block height, must equal w
 * @param type      selects the weight set and is forwarded to
 *                  ff_spatial_dwt(): 0 = 9/7, 1 = 5/3
 * @return weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* checked before tmp is touched, not after the transform */
    assert(w==h);

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            /* multiply instead of <<4: the difference can be negative */
            tmp[32*i+j] = (pix1[j] - pix2[j]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only summed at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            const int size= w>>(dec_count-level);
            const int sx= (ori&1) ? size : 0;
            const int stride= 32<<(dec_count-level);
            const int sy= (ori&2) ? stride>>1 : 0;
            const int weight= scale[type][dec_count-3][level][ori];

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    const int coef= tmp[sx + sy + i*stride + j] * weight;
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
412

    
413
/** 8-wide wavelet comparison: calls w_c() with w=8 and type 1 (5/3). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
416

    
417
/** 8-wide wavelet comparison: calls w_c() with w=8 and type 0 (9/7). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
420

    
421
/** 16-wide wavelet comparison: calls w_c() with w=16 and type 1 (5/3). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
424

    
425
/** 16-wide wavelet comparison: calls w_c() with w=16 and type 0 (9/7). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
428

    
429
/**
 * 32-wide wavelet comparison: calls w_c() with w=32 and type 1 (5/3).
 * Has external linkage, unlike the 8- and 16-wide variants.
 */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
432

    
433
/**
 * 32-wide wavelet comparison: calls w_c() with w=32 and type 0 (9/7).
 * Has external linkage, unlike the 8- and 16-wide variants.
 */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
436
#endif
437

    
438
/* draw the edges of width 'w' of an image of size width, height */
439
//FIXME check that this is ok for mpeg4 interlaced
440
/**
 * Extends a picture by w samples on every side by replicating its border.
 *
 * The caller must provide w writable rows above and below the picture and
 * w writable samples to the left and right of every row.
 *
 * @param buf    top-left sample of the picture
 * @param wrap   distance in bytes between vertically adjacent samples
 * @param width  picture width in samples
 * @param height picture height in rows
 * @param w      width of the edge to draw
 */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom: copies of the first and the last row */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right: first and last sample of each row */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners: filled with the nearest corner sample of the picture */
    for(i=0;i<w;i++) {
        uint8_t *above = buf - (i + 1) * wrap;
        uint8_t *below = last_line + (i + 1) * wrap;

        memset(above - w, buf[0], w); /* top left */
        memset(above + width, buf[width-1], w); /* top right */
        memset(below - w, last_line[0], w); /* bottom left */
        memset(below + width, last_line[width-1], w); /* bottom right */
    }
}
466

    
467
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * Positions of the block that lie outside the source picture receive the value
 * of the nearest sample inside it. If the source picture is empty (w <= 0 or
 * h <= 0) there is nothing to replicate and buf is left untouched.
 * @param buf destination buffer
 * @param src source buffer, pointing at the sample at (src_x, src_y); this
 *            position may lie outside the picture, only samples inside it are read
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(w<=0 || h<=0)
        return;

    /* a block entirely outside the picture is moved back until exactly one
       row / column of it overlaps the picture */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* [start, end) is the part of the block that lies inside the picture */
    start_y= src_y<0 ? -src_y : 0;
    start_x= src_x<0 ? -src_x : 0;
    end_y= h-src_y<block_h ? h-src_y : block_h;
    end_x= w-src_x<block_w ? w-src_x : block_w;

    // copy existing part
    for(y=start_y; y<end_y; y++){
        const uint8_t *src_row= src + y*linesize;
        uint8_t *dst_row= buf + y*linesize;

        for(x=start_x; x<end_x; x++)
            dst_row[x]= src_row[x];
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++)
            buf[x + y*linesize]= buf[x + start_y*linesize];
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++)
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
    }

    /* done last so that the rows replicated above are widened too */
    for(y=0; y<block_h; y++){
        uint8_t *row= buf + y*linesize;

        //left
        for(x=0; x<start_x; x++)
            row[x]= row[start_x];

        //right
        for(x=end_x; x<block_w; x++)
            row[x]= row[end_x - 1];
    }
}
537

    
538
/**
 * Loads an 8x8 block of 8-bit samples into a coefficient block.
 *
 * @param block     destination, 64 entries stored row by row (8 per row)
 * @param pixels    top-left source sample
 * @param line_size distance in bytes between vertically adjacent source samples
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for(row=0; row<8; row++) {
        for(col=0; col<8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
556

    
557
/**
 * Stores the sample-wise difference s1 - s2 of two 8x8 blocks.
 *
 * @param block  destination, 64 entries stored row by row (8 per row)
 * @param s1     top-left sample of the minuend block
 * @param s2     top-left sample of the subtrahend block
 * @param stride distance in bytes between rows, for both s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for(row=0; row<8; row++) {
        for(col=0; col<8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
576

    
577

    
578
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
579
                                 int line_size)
580
{
581
    int i;
582
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
583

    
584
    /* read the pixels */
585
    for(i=0;i<8;i++) {
586
        pixels[0] = cm[block[0]];
587
        pixels[1] = cm[block[1]];
588
        pixels[2] = cm[block[2]];
589
        pixels[3] = cm[block[3]];
590
        pixels[4] = cm[block[4]];
591
        pixels[5] = cm[block[5]];
592
        pixels[6] = cm[block[6]];
593
        pixels[7] = cm[block[7]];
594

    
595
        pixels += line_size;
596
        block += 8;
597
    }
598
}
599

    
600
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
601
                                 int line_size)
602
{
603
    int i;
604
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
605

    
606
    /* read the pixels */
607
    for(i=0;i<4;i++) {
608
        pixels[0] = cm[block[0]];
609
        pixels[1] = cm[block[1]];
610
        pixels[2] = cm[block[2]];
611
        pixels[3] = cm[block[3]];
612

    
613
        pixels += line_size;
614
        block += 8;
615
    }
616
}
617

    
618
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
619
                                 int line_size)
620
{
621
    int i;
622
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
623

    
624
    /* read the pixels */
625
    for(i=0;i<2;i++) {
626
        pixels[0] = cm[block[0]];
627
        pixels[1] = cm[block[1]];
628

    
629
        pixels += line_size;
630
        block += 8;
631
    }
632
}
633

    
634
/**
 * Stores an 8x8 block of signed coefficients as unsigned samples.
 * Each coefficient is biased by +128; values below -128 become 0 and
 * values above 127 become 255.
 *
 * @param block     64 coefficients stored contiguously, row by row
 * @param pixels    top-left destination sample
 * @param line_size distance in bytes between vertically adjacent destination samples
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const DCTELEM c = block[col];

            if (c < -128)
                pixels[col] = 0;
            else if (c > 127)
                pixels[col] = 255;
            else
                pixels[col] = (uint8_t)(c + 128);
        }
        block += 8;
        pixels += line_size;
    }
}
654

    
655
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
656
                          int line_size)
657
{
658
    int i;
659
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
660

    
661
    /* read the pixels */
662
    for(i=0;i<8;i++) {
663
        pixels[0] = cm[pixels[0] + block[0]];
664
        pixels[1] = cm[pixels[1] + block[1]];
665
        pixels[2] = cm[pixels[2] + block[2]];
666
        pixels[3] = cm[pixels[3] + block[3]];
667
        pixels[4] = cm[pixels[4] + block[4]];
668
        pixels[5] = cm[pixels[5] + block[5]];
669
        pixels[6] = cm[pixels[6] + block[6]];
670
        pixels[7] = cm[pixels[7] + block[7]];
671
        pixels += line_size;
672
        block += 8;
673
    }
674
}
675

    
676
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
677
                          int line_size)
678
{
679
    int i;
680
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
681

    
682
    /* read the pixels */
683
    for(i=0;i<4;i++) {
684
        pixels[0] = cm[pixels[0] + block[0]];
685
        pixels[1] = cm[pixels[1] + block[1]];
686
        pixels[2] = cm[pixels[2] + block[2]];
687
        pixels[3] = cm[pixels[3] + block[3]];
688
        pixels += line_size;
689
        block += 8;
690
    }
691
}
692

    
693
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
694
                          int line_size)
695
{
696
    int i;
697
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
698

    
699
    /* read the pixels */
700
    for(i=0;i<2;i++) {
701
        pixels[0] = cm[pixels[0] + block[0]];
702
        pixels[1] = cm[pixels[1] + block[1]];
703
        pixels += line_size;
704
        block += 8;
705
    }
706
}
707

    
708
/**
 * Adds an 8x8 coefficient block to the samples in pixels without clamping:
 * each result is truncated to 8 bits by the store into uint8_t.
 *
 * @param pixels    top-left sample, updated in place
 * @param block     64 coefficients stored row by row (8 per row)
 * @param line_size distance in bytes between vertically adjacent samples
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for(row=0; row<8; row++) {
        for(col=0; col<8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
724

    
725
/**
 * Adds a 4x4 coefficient block to the samples in pixels without clamping:
 * each result is truncated to 8 bits by the store into uint8_t.
 *
 * @param pixels    top-left sample, updated in place
 * @param block     16 coefficients stored row by row with a row stride of 4
 *                  entries (not 8 as in the clamped 4x4 variants)
 * @param line_size distance in bytes between vertically adjacent samples
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for(row=0; row<4; row++) {
        for(col=0; col<4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
737

    
738
/**
 * Sums the absolute values of the 64 coefficients of a block.
 *
 * @param block 64 coefficients
 * @return sum of FFABS(block[i]) for i in 0..63
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;

    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
745

    
746
#if 0
747

748
#define PIXOP2(OPNAME, OP) \
749
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
750
{\
751
    int i;\
752
    for(i=0; i<h; i++){\
753
        OP(*((uint64_t*)block), AV_RN64(pixels));\
754
        pixels+=line_size;\
755
        block +=line_size;\
756
    }\
757
}\
758
\
759
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
760
{\
761
    int i;\
762
    for(i=0; i<h; i++){\
763
        const uint64_t a= AV_RN64(pixels  );\
764
        const uint64_t b= AV_RN64(pixels+1);\
765
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
766
        pixels+=line_size;\
767
        block +=line_size;\
768
    }\
769
}\
770
\
771
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
772
{\
773
    int i;\
774
    for(i=0; i<h; i++){\
775
        const uint64_t a= AV_RN64(pixels  );\
776
        const uint64_t b= AV_RN64(pixels+1);\
777
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
778
        pixels+=line_size;\
779
        block +=line_size;\
780
    }\
781
}\
782
\
783
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
784
{\
785
    int i;\
786
    for(i=0; i<h; i++){\
787
        const uint64_t a= AV_RN64(pixels          );\
788
        const uint64_t b= AV_RN64(pixels+line_size);\
789
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
790
        pixels+=line_size;\
791
        block +=line_size;\
792
    }\
793
}\
794
\
795
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
796
{\
797
    int i;\
798
    for(i=0; i<h; i++){\
799
        const uint64_t a= AV_RN64(pixels          );\
800
        const uint64_t b= AV_RN64(pixels+line_size);\
801
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
802
        pixels+=line_size;\
803
        block +=line_size;\
804
    }\
805
}\
806
\
807
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
808
{\
809
        int i;\
810
        const uint64_t a= AV_RN64(pixels  );\
811
        const uint64_t b= AV_RN64(pixels+1);\
812
        uint64_t l0=  (a&0x0303030303030303ULL)\
813
                    + (b&0x0303030303030303ULL)\
814
                    + 0x0202020202020202ULL;\
815
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
816
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
817
        uint64_t l1,h1;\
818
\
819
        pixels+=line_size;\
820
        for(i=0; i<h; i+=2){\
821
            uint64_t a= AV_RN64(pixels  );\
822
            uint64_t b= AV_RN64(pixels+1);\
823
            l1=  (a&0x0303030303030303ULL)\
824
               + (b&0x0303030303030303ULL);\
825
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
826
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
827
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
828
            pixels+=line_size;\
829
            block +=line_size;\
830
            a= AV_RN64(pixels  );\
831
            b= AV_RN64(pixels+1);\
832
            l0=  (a&0x0303030303030303ULL)\
833
               + (b&0x0303030303030303ULL)\
834
               + 0x0202020202020202ULL;\
835
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
836
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
837
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
838
            pixels+=line_size;\
839
            block +=line_size;\
840
        }\
841
}\
842
\
843
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
844
{\
845
        int i;\
846
        const uint64_t a= AV_RN64(pixels  );\
847
        const uint64_t b= AV_RN64(pixels+1);\
848
        uint64_t l0=  (a&0x0303030303030303ULL)\
849
                    + (b&0x0303030303030303ULL)\
850
                    + 0x0101010101010101ULL;\
851
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
852
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
853
        uint64_t l1,h1;\
854
\
855
        pixels+=line_size;\
856
        for(i=0; i<h; i+=2){\
857
            uint64_t a= AV_RN64(pixels  );\
858
            uint64_t b= AV_RN64(pixels+1);\
859
            l1=  (a&0x0303030303030303ULL)\
860
               + (b&0x0303030303030303ULL);\
861
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
862
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
863
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
864
            pixels+=line_size;\
865
            block +=line_size;\
866
            a= AV_RN64(pixels  );\
867
            b= AV_RN64(pixels+1);\
868
            l0=  (a&0x0303030303030303ULL)\
869
               + (b&0x0303030303030303ULL)\
870
               + 0x0101010101010101ULL;\
871
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
872
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
873
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
874
            pixels+=line_size;\
875
            block +=line_size;\
876
        }\
877
}\
878
\
879
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
880
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
881
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
882
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
883
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
884
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
885
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
886

887
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
888
#else // 64 bit variant
889

    
890
/*
 * PIXOP2(OPNAME, OP) -- 32-bit-word variant.
 *
 * Expands to the family of block copy / half-pel interpolation routines
 * named OPNAME_*.  OP(dst, value) is the store operation; in this file it is
 * instantiated with op_put (plain store) and op_avg (average with the
 * existing destination).
 *
 * Helpers come from outside this macro: AV_RN16/AV_RN32 read a 16/32-bit
 * word from a byte pointer, rnd_avg32()/no_rnd_avg32() combine two packed
 * words (presumably per-byte rounded / truncated averages -- see their
 * definitions), and CALL_2X_PIXELS builds a 16-wide routine out of an
 * 8-wide one (see its definition).
 *
 * The *_l4 and *_xy2 routines average four packed pixels per byte lane
 * without overflow by splitting every byte into its low 2 bits (l0/l1, which
 * also carry the rounding constant) and its high 6 bits pre-shifted by 2
 * (h0/h1); the result is h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F).
 */
#define PIXOP2(OPNAME, OP) \
/* Plain 2/4/8-pixel-wide copies (or averages, depending on OP). */\
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* A full-pel copy involves no rounding, so the no_rnd variant is the same. */\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
/* Two-source combiners: dst = OP(avg(src1, src2)), each with its own stride. */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
/* 16-wide two-source combiners: left and right 8-pixel halves. */\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* Half-pel x2 (right neighbour) and y2 (neighbour one line below), 8 wide. */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Four-source combiner, rounding: per byte (a+b+c+d+2)>>2. */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
/* Half-pel x2 / y2 for the 4- and 2-pixel-wide blocks. */\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Four-source combiner, non-rounding: per byte (a+b+c+d+1)>>2. */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
/* 16-wide four-source combiners: left and right 8-pixel halves. */\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* Half-pel xy2, 2 wide, done per byte: (p + right + below + below-right + 2) >> 2. */\
/* Two output lines per iteration; a0/b0 hold the upper-line sums incl. the +2. */\
/* Stores go through OP so that the avg instantiation really averages. */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            OP(block[0], (a1+a0)>>2);\
            OP(block[1], (b1+b0)>>2);\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            OP(block[0], (a1+a0)>>2);\
            OP(block[1], (b1+b0)>>2);\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* Half-pel xy2, 4 wide, one packed word per line; two lines per iteration. */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* Half-pel xy2, 8 wide: two passes (j) over 4-pixel columns. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top line and step 4 pixels right for the second pass */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* Same as _pixels8_xy2_c but with rounding constant 1 instead of 2. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* 16-wide entry points built from the 8-wide routines. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)
1255

    
1256
#define op_avg(a, b) a = rnd_avg32(a, b)
1257
#endif
1258
#define op_put(a, b) a = b
1259

    
1260
PIXOP2(avg, op_avg)
1261
PIXOP2(put, op_put)
1262
#undef op_avg
1263
#undef op_put
1264

    
1265
/* Rounded integer averages of two / four values.  Every argument is
 * parenthesised so that argument expressions containing operators of lower
 * precedence than '+' (e.g. |, &, ?:) are averaged as a whole. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1267

    
1268
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources a and b all advance by the same stride for h lines.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* argument order: dst stride, stride of a, stride of b */
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1271

    
1272
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources a and b all advance by the same stride for h lines.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* argument order: dst stride, stride of a, stride of b */
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1275

    
1276
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fixed 1/16-pel offset.
 *
 * @param dst     output block, 8 pixels per line, h lines
 * @param src     source; columns 0..8 of lines 0..h are read
 * @param stride  line stride shared by dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal fraction, used as a weight out of 16
 * @param y16     vertical fraction, used as a weight out of 16
 * @param rounder value added to each weighted sum before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* weights of the four neighbours; they always add up to 256 */
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (     x16) * (16 - y16);
    const int w_bl = (16 - x16) * (     y16);
    const int w_br = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1298

    
1299
/**
 * Warped motion compensation of an 8-pixel-wide, h-line block (C version).
 *
 * For every output pixel a source position (vx, vy) is tracked; it starts at
 * (ox, oy), advances by (dxx, dyx) per output column and by (dxy, dyy) per
 * output line.  The low 16 bits of each coordinate are dropped, the next
 * `shift` bits are the interpolation fraction and the rest is the integer
 * sample position.
 *
 * Positions inside the picture are bilinearly interpolated.  If only one
 * coordinate is outside, it is clamped with av_clip() and interpolation is
 * done along the other axis only; if both are outside, the clamped sample is
 * copied.
 *
 * @param r              rounding term added before the final >> (2*shift)
 * @param width, height  source dimensions; decremented internally so that the
 *                       +1 neighbour reads stay inside
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    const int frac_mask = s - 1;
    int y;

    width--;
    height--;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            uint8_t *out = &dst[y*stride + x];
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative positions as well */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if (x_inside && y_inside) {
                index = src_x + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_x)
                           + src[index       +1]*   frac_x )*(s-frac_y)
                        + (  src[index+stride  ]*(s-frac_x)
                           + src[index+stride+1]*   frac_x )*   frac_y
                        + r)>>(shift*2);
            } else if (x_inside) {
                /* line clamped: horizontal interpolation only */
                index = src_x + av_clip(src_y, 0, height)*stride;
                *out = ( (  src[index         ]*(s-frac_x)
                          + src[index       +1]*   frac_x )*s
                        + r)>>(shift*2);
            } else if (y_inside) {
                /* column clamped: vertical interpolation only */
                index = av_clip(src_x, 0, width) + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_y)
                           + src[index+stride  ]*   frac_y )*s
                        + r)>>(shift*2);
            } else {
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                *out = src[index];
            }
        }
    }
}
1356

    
1357
/**
 * Position (0,0): no interpolation, so this only forwards to the plain
 * put_pixels routine matching the block width.  Widths other than
 * 2, 4, 8 and 16 are ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:
        put_pixels2_c(dst, src, stride, height);
        break;
    case 4:
        put_pixels4_c(dst, src, stride, height);
        break;
    case 8:
        put_pixels8_c(dst, src, stride, height);
        break;
    case 16:
        put_pixels16_c(dst, src, stride, height);
        break;
    default:
        break;
    }
}
1365

    
1366
/**
 * dst = (2*cur + right + 1) / 3 for a width x height block; the division
 * by 3 is done as a multiply by 683 followed by >> 11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1376

    
1377
/**
 * dst = (cur + 2*right + 1) / 3 for a width x height block; the division
 * by 3 is done as a multiply by 683 followed by >> 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1387

    
1388
/**
 * dst = (2*cur + below + 1) / 3 for a width x height block; the division
 * by 3 is done as a multiply by 683 followed by >> 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1398

    
1399
/**
 * dst = (4*cur + 3*right + 3*below + 2*diag + 6) / 12 for a width x height
 * block; the division by 12 is done as a multiply by 2731 followed by >> 15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1409

    
1410
/**
 * dst = (3*cur + 2*right + 4*below + 3*diag + 6) / 12 for a width x height
 * block; the division by 12 is done as a multiply by 2731 followed by >> 15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1420

    
1421
/**
 * dst = (cur + 2*below + 1) / 3 for a width x height block; the division
 * by 3 is done as a multiply by 683 followed by >> 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1431

    
1432
/**
 * dst = (3*cur + 4*right + 2*below + 3*diag + 6) / 12 for a width x height
 * block; the division by 12 is done as a multiply by 2731 followed by >> 15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1442

    
1443
/**
 * dst = (2*cur + 3*right + 3*below + 4*diag + 6) / 12 for a width x height
 * block; the division by 12 is done as a multiply by 2731 followed by >> 15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1453

    
1454
/**
 * Position (0,0): no interpolation, so this only forwards to the plain
 * avg_pixels routine matching the block width.  Widths other than
 * 2, 4, 8 and 16 are ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:
        avg_pixels2_c(dst, src, stride, height);
        break;
    case 4:
        avg_pixels4_c(dst, src, stride, height);
        break;
    case 8:
        avg_pixels8_c(dst, src, stride, height);
        break;
    case 16:
        avg_pixels16_c(dst, src, stride, height);
        break;
    default:
        break;
    }
}
1462

    
1463
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (2*cur + right + 1) / 3, the /3 being computed as *683 >> 11.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1473

    
1474
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (cur + 2*right + 1) / 3, the /3 being computed as *683 >> 11.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1484

    
1485
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (2*cur + below + 1) / 3, the /3 being computed as *683 >> 11.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1495

    
1496
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (4*cur + 3*right + 3*below + 2*diag + 6) / 12, the /12 being computed
 * as *2731 >> 15.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1506

    
1507
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (3*cur + 2*right + 4*below + 3*diag + 6) / 12, the /12 being computed
 * as *2731 >> 15.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1517

    
1518
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (cur + 2*below + 1) / 3, the /3 being computed as *683 >> 11.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1528

    
1529
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (3*cur + 4*right + 2*below + 3*diag + 6) / 12, the /12 being computed
 * as *2731 >> 15.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1539

    
1540
/**
 * Rounded average of the existing dst pixel with the interpolated value
 * (2*cur + 3*right + 3*below + 4*diag + 6) / 12, the /12 being computed
 * as *2731 >> 15.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1550
#if 0
1551
/*
 * TPEL_WIDTH(width): generates fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) around the generic
 * put_tpel_pixels_mcXY_c() routines.  Each wrapper body is a plain call; a
 * leading "void" would turn it into a malformed declaration instead.
 * (This macro sits in a disabled "#if 0" region.)
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1570
#endif
1571

    
1572
/*
 * H264_CHROMA_MC(OPNAME, OP): generates OPNAME h264_chroma_mc{2,4,8}_c(),
 * bilinear interpolation of a 2/4/8-pixel-wide, h-line block at the
 * fractional offset (x, y), each in 0..7.
 *
 * The four neighbour weights are (8-x)(8-y), x(8-y), (8-x)y and xy, which
 * sum to 64; OP(dst, sum) receives the unnormalised weighted sum and is
 * responsible for rounding/scaling and for the store.
 *
 * When the diagonal weight is zero (x == 0 or y == 0) only one neighbour
 * besides the current sample contributes, so a 2-tap path is used: the
 * neighbour is one line below if the vertical weight is non-zero, otherwise
 * one pixel to the right.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w_nb= w_tr+w_bl;\
        const int nb_off= w_bl ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w_tl*src[col] + w_nb*src[nb_off+col]));\
            }\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w_nb= w_tr+w_bl;\
        const int nb_off= w_bl ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w_tl*src[col] + w_nb*src[nb_off+col]));\
            }\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl=(8-x)*(8-y);\
    const int w_tr=(  x)*(8-y);\
    const int w_bl=(8-x)*(  y);\
    const int w_br=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w_br){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w_nb= w_tr+w_bl;\
        const int nb_off= w_bl ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w_tl*src[col] + w_nb*src[nb_off+col]));\
            }\
        }\
    }\
}
1672

    
1673
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1674
#define op_put(a, b) a = (((b) + 32)>>6)
1675

    
1676
H264_CHROMA_MC(put_       , op_put)
1677
H264_CHROMA_MC(avg_       , op_avg)
1678
#undef op_avg
1679
#undef op_put
1680

    
1681
/**
 * Bilinear interpolation of an 8-pixel-wide block at eighth-pel offset (x, y),
 * "no rounding" variant: the bias added before the >>6 is 32 - 4 = 28 rather
 * than 32, so exact halves round down.
 *
 * @param dst    destination, 8 pixels written per row
 * @param src    source; row i+1 is read only when y != 0 and column 8 only
 *               when x != 0
 * @param stride distance in bytes between rows, shared by dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction in 1/8 pixel units, 0..7
 * @param y      vertical fraction in 1/8 pixel units, 0..7
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    const int E= B+C;
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(D){
        /* x != 0 && y != 0: all four neighbours carry weight */
        for(i=0; i<h; i++){
            for(j=0; j<8; j++){
                dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
            }
            dst+= stride;
            src+= stride;
        }
    }else if(E){
        /* exactly one of x, y is zero, so exactly one of B, C is non-zero:
         * interpolate along that single direction and never touch the
         * zero-weight neighbours */
        const int step= C ? stride : 1;
        for(i=0; i<h; i++){
            for(j=0; j<8; j++){
                dst[j] = (A*src[j] + E*src[step+j] + 32 - 4) >> 6;
            }
            dst+= stride;
            src+= stride;
        }
    }else{
        /* x == 0 && y == 0: A == 64 and the result equals the source pixel;
         * the formula is kept so the three branches stay visibly parallel */
        for(i=0; i<h; i++){
            for(j=0; j<8; j++){
                dst[j] = (A*src[j] + 32 - 4) >> 6;
            }
            dst+= stride;
            src+= stride;
        }
    }
}
1704

    
1705
/*
 * Helper for QPEL_MC: emits the sixteen quarter-pel entry points
 * OPNAME##qpel<SIZE>_mcXY_c (X, Y in 0..3) plus the six exported
 * ff_<OPNAME>qpel<SIZE>_mcXY_old_c variants for one block size.
 *
 *   SIZE   block width/height in pixels (8 or 16)
 *   ROWS   SIZE + 1, the number of source rows/columns the filters read;
 *          also selects copy_block<ROWS>()
 *   PITCH  row pitch of the on-stack copy "full" of the source block
 *
 * In mcXY, X describes the horizontal and Y the vertical treatment:
 * 0 = none, 2 = lowpass (half-pel) filter only, 1 and 3 = lowpass result
 * averaged with the unfiltered pixels on the left/top (1) or right/bottom (3).
 * Intermediate planes are always built with the put##RND operators; only the
 * final store into dst uses OPNAME.
 *
 * This macro must stay defined for as long as QPEL_MC is instantiated.
 */
#define QPEL_MC_BLOCK(OPNAME, RND, SIZE, ROWS, PITCH) \
static void OPNAME ## qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t half[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(half, full, SIZE, PITCH);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full, half, stride, PITCH, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, full, stride, PITCH);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t half[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(half, full, SIZE, PITCH);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full+PITCH, half, stride, PITCH, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full, halfH, halfV, halfHV, stride, PITCH, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+1, halfH, halfV, halfHV, stride, PITCH, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+PITCH, halfH+SIZE, halfV, halfHV, stride, PITCH, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full  , SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+PITCH+1, halfH+SIZE, halfV, halfHV, stride, PITCH, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfHV[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, PITCH, ROWS);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, PITCH);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[PITCH*ROWS];\
    uint8_t halfH[SIZE*ROWS];\
    copy_block ## ROWS(full, src, PITCH, stride, ROWS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, PITCH, ROWS);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, PITCH, ROWS);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*ROWS];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, ROWS);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}

/*
 * MPEG-4 style quarter-pel motion compensation template.
 *
 *   r       not referenced by the template body
 *   OPNAME  prefix of every generated function (e.g. put_, avg_)
 *   RND     infix selecting which put* helpers build intermediate planes
 *           ("_" or "_no_rnd_")
 *   OP      store operator OP(dst_pixel, filtered_sum); it may use the local
 *           crop-table pointer cm
 *
 * Generates the 8- and 16-wide horizontal/vertical lowpass filters and then,
 * through QPEL_MC_BLOCK, all qpel8 / qpel16 mcXY entry points.
 *
 * The lowpass is the symmetric 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1),
 * whose taps sum to 32. Taps that would fall outside the SIZE+1 input samples
 * are mirrored back inside, which is why the first and last three outputs of
 * each row/column use irregular indices. The horizontal filters read SIZE+1
 * pixels per row, the vertical filters SIZE+1 rows per column.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        /* one column per iteration: fetch its 9 input rows first */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        /* one column per iteration: fetch its 17 input rows first */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
QPEL_MC_BLOCK(OPNAME, RND,  8,  9, 16)\
QPEL_MC_BLOCK(OPNAME, RND, 16, 17, 24)
2187

    
2188
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190
#define op_put(a, b) a = cm[((b) + 16)>>5]
2191
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2192

    
2193
QPEL_MC(0, put_       , _       , op_put)
2194
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195
QPEL_MC(0, avg_       , _       , op_avg)
2196
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2197
#undef op_avg
2198
#undef op_avg_no_rnd
2199
#undef op_put
2200
#undef op_put_no_rnd
2201

    
2202
#if 1
2203
#define H264_LOWPASS(OPNAME, OP, OP2) \
2204
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205
    const int h=2;\
2206
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207
    int i;\
2208
    for(i=0; i<h; i++)\
2209
    {\
2210
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2212
        dst+=dstStride;\
2213
        src+=srcStride;\
2214
    }\
2215
}\
2216
\
2217
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2218
    const int w=2;\
2219
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220
    int i;\
2221
    for(i=0; i<w; i++)\
2222
    {\
2223
        const int srcB= src[-2*srcStride];\
2224
        const int srcA= src[-1*srcStride];\
2225
        const int src0= src[0 *srcStride];\
2226
        const int src1= src[1 *srcStride];\
2227
        const int src2= src[2 *srcStride];\
2228
        const int src3= src[3 *srcStride];\
2229
        const int src4= src[4 *srcStride];\
2230
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2232
        dst++;\
2233
        src++;\
2234
    }\
2235
}\
2236
\
2237
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2238
    const int h=2;\
2239
    const int w=2;\
2240
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2241
    int i;\
2242
    src -= 2*srcStride;\
2243
    for(i=0; i<h+5; i++)\
2244
    {\
2245
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2247
        tmp+=tmpStride;\
2248
        src+=srcStride;\
2249
    }\
2250
    tmp -= tmpStride*(h+5-2);\
2251
    for(i=0; i<w; i++)\
2252
    {\
2253
        const int tmpB= tmp[-2*tmpStride];\
2254
        const int tmpA= tmp[-1*tmpStride];\
2255
        const int tmp0= tmp[0 *tmpStride];\
2256
        const int tmp1= tmp[1 *tmpStride];\
2257
        const int tmp2= tmp[2 *tmpStride];\
2258
        const int tmp3= tmp[3 *tmpStride];\
2259
        const int tmp4= tmp[4 *tmpStride];\
2260
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2262
        dst++;\
2263
        tmp++;\
2264
    }\
2265
}\
2266
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2267
    const int h=4;\
2268
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2269
    int i;\
2270
    for(i=0; i<h; i++)\
2271
    {\
2272
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2276
        dst+=dstStride;\
2277
        src+=srcStride;\
2278
    }\
2279
}\
2280
\
2281
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282
    const int w=4;\
2283
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284
    int i;\
2285
    for(i=0; i<w; i++)\
2286
    {\
2287
        const int srcB= src[-2*srcStride];\
2288
        const int srcA= src[-1*srcStride];\
2289
        const int src0= src[0 *srcStride];\
2290
        const int src1= src[1 *srcStride];\
2291
        const int src2= src[2 *srcStride];\
2292
        const int src3= src[3 *srcStride];\
2293
        const int src4= src[4 *srcStride];\
2294
        const int src5= src[5 *srcStride];\
2295
        const int src6= src[6 *srcStride];\
2296
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2300
        dst++;\
2301
        src++;\
2302
    }\
2303
}\
2304
\
2305
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2306
    const int h=4;\
2307
    const int w=4;\
2308
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309
    int i;\
2310
    src -= 2*srcStride;\
2311
    for(i=0; i<h+5; i++)\
2312
    {\
2313
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2317
        tmp+=tmpStride;\
2318
        src+=srcStride;\
2319
    }\
2320
    tmp -= tmpStride*(h+5-2);\
2321
    for(i=0; i<w; i++)\
2322
    {\
2323
        const int tmpB= tmp[-2*tmpStride];\
2324
        const int tmpA= tmp[-1*tmpStride];\
2325
        const int tmp0= tmp[0 *tmpStride];\
2326
        const int tmp1= tmp[1 *tmpStride];\
2327
        const int tmp2= tmp[2 *tmpStride];\
2328
        const int tmp3= tmp[3 *tmpStride];\
2329
        const int tmp4= tmp[4 *tmpStride];\
2330
        const int tmp5= tmp[5 *tmpStride];\
2331
        const int tmp6= tmp[6 *tmpStride];\
2332
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2336
        dst++;\
2337
        tmp++;\
2338
    }\
2339
}\
2340
\
2341
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342
    const int h=8;\
2343
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2344
    int i;\
2345
    for(i=0; i<h; i++)\
2346
    {\
2347
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2355
        dst+=dstStride;\
2356
        src+=srcStride;\
2357
    }\
2358
}\
2359
\
2360
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2361
    const int w=8;\
2362
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2363
    int i;\
2364
    for(i=0; i<w; i++)\
2365
    {\
2366
        const int srcB= src[-2*srcStride];\
2367
        const int srcA= src[-1*srcStride];\
2368
        const int src0= src[0 *srcStride];\
2369
        const int src1= src[1 *srcStride];\
2370
        const int src2= src[2 *srcStride];\
2371
        const int src3= src[3 *srcStride];\
2372
        const int src4= src[4 *srcStride];\
2373
        const int src5= src[5 *srcStride];\
2374
        const int src6= src[6 *srcStride];\
2375
        const int src7= src[7 *srcStride];\
2376
        const int src8= src[8 *srcStride];\
2377
        const int src9= src[9 *srcStride];\
2378
        const int src10=src[10*srcStride];\
2379
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2387
        dst++;\
2388
        src++;\
2389
    }\
2390
}\
2391
\
2392
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2393
    const int h=8;\
2394
    const int w=8;\
2395
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2396
    int i;\
2397
    src -= 2*srcStride;\
2398
    for(i=0; i<h+5; i++)\
2399
    {\
2400
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2408
        tmp+=tmpStride;\
2409
        src+=srcStride;\
2410
    }\
2411
    tmp -= tmpStride*(h+5-2);\
2412
    for(i=0; i<w; i++)\
2413
    {\
2414
        const int tmpB= tmp[-2*tmpStride];\
2415
        const int tmpA= tmp[-1*tmpStride];\
2416
        const int tmp0= tmp[0 *tmpStride];\
2417
        const int tmp1= tmp[1 *tmpStride];\
2418
        const int tmp2= tmp[2 *tmpStride];\
2419
        const int tmp3= tmp[3 *tmpStride];\
2420
        const int tmp4= tmp[4 *tmpStride];\
2421
        const int tmp5= tmp[5 *tmpStride];\
2422
        const int tmp6= tmp[6 *tmpStride];\
2423
        const int tmp7= tmp[7 *tmpStride];\
2424
        const int tmp8= tmp[8 *tmpStride];\
2425
        const int tmp9= tmp[9 *tmpStride];\
2426
        const int tmp10=tmp[10*tmpStride];\
2427
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2435
        dst++;\
2436
        tmp++;\
2437
    }\
2438
}\
2439
\
2440
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2442
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443
    src += 8*srcStride;\
2444
    dst += 8*dstStride;\
2445
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2446
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2447
}\
2448
\
2449
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2451
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452
    src += 8*srcStride;\
2453
    dst += 8*dstStride;\
2454
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2455
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2456
}\
2457
\
2458
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2460
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461
    src += 8*srcStride;\
2462
    dst += 8*dstStride;\
2463
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2464
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2465
}\
2466

    
2467
/**
 * Emits the sixteen OPNAME##h264_qpel##SIZE##_mcXY_c() entry points for one
 * block size. Every variant is assembled from the same building blocks:
 *  - the h / v / hv low-pass filters generated by H264_LOWPASS,
 *  - copy_block##SIZE, used to gather SIZE+5 source rows into a packed
 *    buffer (two rows before the block, three after) for the vertical filter,
 *  - OPNAME##pixels##SIZE##_l2, which merges two SIZExSIZE planes into dst.
 * Intermediate planes are always produced with the put_ filters; only the
 * final store uses OPNAME.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: no filtering, plain SIZExSIZE block operation */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: source block combined with its horizontally filtered plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t h_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, h_plane, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal filter written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: as mc10, but paired with the source block one sample to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t h_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, h_plane, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: packed source block combined with its vertically filtered plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t v_plane[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical filter over the packed rows, written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
/* mc03: as mc01, but paired with the packed block one row further down */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t v_plane[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: h plane at src, v plane at src */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: h plane at src, v plane taken one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: h plane taken one row down, v plane at src */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: h plane one row down, v plane one column to the right */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, v_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-dimensional (hv) filter written straight to dst */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[(SIZE+5)*SIZE];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* mc21: h plane at src combined with the hv plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t hv_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_plane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, hv_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: h plane one row down combined with the hv plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t h_plane[SIZE*SIZE];\
    uint8_t hv_plane[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_plane, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_plane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_plane, hv_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: v plane at src combined with the hv plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    uint8_t hv_plane[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_plane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, v_plane, hv_plane, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: v plane one column to the right combined with the hv plane */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[(SIZE+5)*SIZE];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    int16_t scratch[(SIZE+5)*SIZE];\
    uint8_t v_plane[SIZE*SIZE];\
    uint8_t hv_plane[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_plane, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_plane, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, v_plane, hv_plane, stride, SIZE, SIZE, SIZE);\
}
2603

    
2604
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2607
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2609

    
2610
H264_LOWPASS(put_       , op_put, op2_put)
2611
H264_LOWPASS(avg_       , op_avg, op2_avg)
2612
H264_MC(put_, 2)
2613
H264_MC(put_, 4)
2614
H264_MC(put_, 8)
2615
H264_MC(put_, 16)
2616
H264_MC(avg_, 4)
2617
H264_MC(avg_, 8)
2618
H264_MC(avg_, 16)
2619

    
2620
#undef op_avg
2621
#undef op_put
2622
#undef op2_avg
2623
#undef op2_put
2624
#endif
2625

    
2626
/*
 * Per-sample operators used by the H264_WEIGHT generated functions.
 *   op_scale1: in-place weighting of block[x] with a single weight.
 *   op_scale2: two-weight blend of src[x] into dst[x]; the extra +1 in the
 *              shift accounts for the two contributions.
 * Both clip the result to 0..255 through av_clip_uint8().
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Defines weight_h264_pixels<W>x<H>_c() and biweight_h264_pixels<W>x<H>_c().
 *
 * W must be 2, 4, 8 or 16: each row is fully unrolled to 16 samples and cut
 * short by the compile-time W==2 / W==4 / W==8 tests.
 *
 * weight_*:   block[x] = clip((block[x]*weight + offset') >> log2_denom),
 *             where offset' is offset scaled by 2^log2_denom plus the
 *             rounding term 2^(log2_denom-1) when log2_denom is non-zero.
 * biweight_*: dst[x] = clip((src[x]*weights + dst[x]*weightd + offset')
 *             >> (log2_denom+1)), offset' = ((offset+1)|1) * 2^log2_denom.
 *
 * The offset is scaled by multiplication instead of a left shift: offset is
 * a signed int and left-shifting a negative value is undefined behaviour in
 * C, while the product is well defined and identical for every value the
 * shift handled correctly.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2695

    
2696
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698
    int i;
2699

    
2700
    for(i=0; i<h; i++){
2701
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709
        dst+=dstStride;
2710
        src+=srcStride;
2711
    }
2712
}
2713

    
2714
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* The CAVS mc00 entry points below apply no filter of their own: each one
 * forwards to the generic pixel routine of matching width, with the row
 * count equal to that width. */

/** 8x8 mc00, put variant. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}

/** 8x8 mc00, avg variant. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_c(dst, src, stride, rows);
}

/** 16x16 mc00, put variant. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_c(dst, src, stride, rows);
}

/** 16x16 mc00, avg variant. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_c(dst, src, stride, rows);
}
#endif /* CONFIG_CAVS_DECODER */
2731

    
2732
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/**
 * VC-1 mspel mc00: forwards to put_pixels8_c for an 8-row block.
 * The rnd argument is accepted but not used by this variant.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2740

    
2741
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2742

    
2743
/* H264 specific */
2744
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2745

    
2746
#if defined(CONFIG_RV40_DECODER)
/* The RV40 mc33 entry points reuse the generic *_pixels*_xy2_c routines of
 * matching width, with the row count equal to that width. */

/** 16x16 mc33, put variant. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    put_pixels16_xy2_c(dst, src, stride, rows);
}

/** 16x16 mc33, avg variant. */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;

    avg_pixels16_xy2_c(dst, src, stride, rows);
}

/** 8x8 mc33, put variant. */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_xy2_c(dst, src, stride, rows);
}

/** 8x8 mc33, avg variant. */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    avg_pixels8_xy2_c(dst, src, stride, rows);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2762

    
2763
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2764
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2765
    int i;
2766

    
2767
    for(i=0; i<w; i++){
2768
        const int src_1= src[ -srcStride];
2769
        const int src0 = src[0          ];
2770
        const int src1 = src[  srcStride];
2771
        const int src2 = src[2*srcStride];
2772
        const int src3 = src[3*srcStride];
2773
        const int src4 = src[4*srcStride];
2774
        const int src5 = src[5*srcStride];
2775
        const int src6 = src[6*srcStride];
2776
        const int src7 = src[7*srcStride];
2777
        const int src8 = src[8*srcStride];
2778
        const int src9 = src[9*srcStride];
2779
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2780
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2781
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2782
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2783
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2784
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2785
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2786
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2787
        src++;
2788
        dst++;
2789
    }
2790
}
2791

    
2792
/** mspel mc00: no filtering, forwards to put_pixels8_c for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2795

    
2796
/**
 * mspel mc10: horizontally filters the 8x8 block into a packed temporary,
 * then hands the source block and that plane to put_pixels8_l2.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src, 8, stride, 8);
    put_pixels8_l2(dst, src, h_plane, stride, stride, 8, 8);
}
2801

    
2802
/** mspel mc20: horizontal low-pass of 8 rows written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2805

    
2806
/**
 * mspel mc30: like mc10, except that the horizontally filtered plane is
 * paired with the source block starting one sample to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_plane[8*8];

    wmv2_mspel8_h_lowpass(h_plane, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, h_plane, stride, stride, 8, 8);
}
2811

    
2812
/** mspel mc02: vertical low-pass of 8 columns written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2815

    
2816
/**
 * mspel mc12: combines the vertically filtered block with the
 * horizontal-then-vertical filtered block.
 *
 * The horizontal pass covers 11 rows starting one row above src, so that the
 * following vertical pass (started at the second row of that buffer) has the
 * rows it needs above and below the block.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[11*8];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
2825
/**
 * mspel mc32: like mc12, except that the purely vertical plane is taken one
 * column to the right (src+1). The horizontal pass still covers 11 rows
 * starting one row above src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[11*8];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
2834
/**
 * mspel mc22: horizontal low-pass over 11 rows (from one row above src) into
 * a packed buffer, then vertical low-pass of that buffer straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[11*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
2839

    
2840
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2841
    if(ENABLE_ANY_H263) {
2842
    int x;
2843
    const int strength= ff_h263_loop_filter_strength[qscale];
2844

    
2845
    for(x=0; x<8; x++){
2846
        int d1, d2, ad1;
2847
        int p0= src[x-2*stride];
2848
        int p1= src[x-1*stride];
2849
        int p2= src[x+0*stride];
2850
        int p3= src[x+1*stride];
2851
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2852

    
2853
        if     (d<-2*strength) d1= 0;
2854
        else if(d<-  strength) d1=-2*strength - d;
2855
        else if(d<   strength) d1= d;
2856
        else if(d< 2*strength) d1= 2*strength - d;
2857
        else                   d1= 0;
2858

    
2859
        p1 += d1;
2860
        p2 -= d1;
2861
        if(p1&256) p1= ~(p1>>31);
2862
        if(p2&256) p2= ~(p2>>31);
2863

    
2864
        src[x-1*stride] = p1;
2865
        src[x+0*stride] = p2;
2866

    
2867
        ad1= FFABS(d1)>>1;
2868

    
2869
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2870

    
2871
        src[x-2*stride] = p0 - d2;
2872
        src[x+  stride] = p3 + d2;
2873
    }
2874
    }
2875
}
2876

    
2877
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2878
    if(ENABLE_ANY_H263) {
2879
    int y;
2880
    const int strength= ff_h263_loop_filter_strength[qscale];
2881

    
2882
    for(y=0; y<8; y++){
2883
        int d1, d2, ad1;
2884
        int p0= src[y*stride-2];
2885
        int p1= src[y*stride-1];
2886
        int p2= src[y*stride+0];
2887
        int p3= src[y*stride+1];
2888
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2889

    
2890
        if     (d<-2*strength) d1= 0;
2891
        else if(d<-  strength) d1=-2*strength - d;
2892
        else if(d<   strength) d1= d;
2893
        else if(d< 2*strength) d1= 2*strength - d;
2894
        else                   d1= 0;
2895

    
2896
        p1 += d1;
2897
        p2 -= d1;
2898
        if(p1&256) p1= ~(p1>>31);
2899
        if(p2&256) p2= ~(p2>>31);
2900

    
2901
        src[y*stride-1] = p1;
2902
        src[y*stride+0] = p2;
2903

    
2904
        ad1= FFABS(d1)>>1;
2905

    
2906
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2907

    
2908
        src[y*stride-2] = p0 - d2;
2909
        src[y*stride+1] = p3 + d2;
2910
    }
2911
    }
2912
}
2913

    
2914
/**
 * H.261 loop filter: separable (1, 2, 1) smoothing of an 8x8 block, in place.
 *
 * Pass 1 filters vertically into a scratch array; the top and bottom rows
 * are not filtered and are simply scaled by 4 so that every scratch value
 * carries the same gain. Pass 2 filters horizontally and writes back: the
 * left and right columns only undo the vertical gain (rounded >>2), interior
 * samples are normalised by 16 with rounding.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows of the block
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* pass 1: vertical */
    for(col=0; col<8; col++){
        vert[col       ] = src[col           ] * 4;
        vert[col + 7*8 ] = src[col + 7*stride] * 4;
    }
    for(row=1; row<7; row++){
        const uint8_t *in = src  + row*stride;
        int *out          = vert + row*8;

        for(col=0; col<8; col++)
            out[col] = in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* pass 2: horizontal, written back to the block */
    for(row=0; row<8; row++){
        const int *in = vert + row*8;
        uint8_t *out  = src  + row*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2940

    
2941
/**
 * H.264 luma deblocking (tc0-clipped filter) of one 16-line edge.
 *
 * The edge is handled as four segments of four lines; segment i is limited
 * by tc0[i]. On each line the samples p2,p1,p0 | q0,q1,q2 straddle the edge,
 * with q0 at pix and p0 at pix[-xstride].
 *
 * @param pix     q0 sample of the first line
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step from one line to the next along the edge
 * @param alpha   a line is filtered only if |p0-q0| < alpha
 * @param beta    ... and |p1-p0| < beta and |q1-q0| < beta; also gates the
 *                p1 / q1 updates through |p2-p0| and |q2-q0|
 * @param tc0     four clipping limits; a negative entry leaves its segment
 *                untouched
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* segment disabled: skip its four lines */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* the p0/q0 limit grows by one for every side whose second
                 * sample (p1 / q1) is also filtered */
                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* (q0 - p0) is negative whenever q0 < p0; multiply by 4
                 * because left-shifting a negative int is undefined */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2981
/**
 * Luma deblocking with samples across the edge `stride` apart and
 * successive lines one sample apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2985
/**
 * Luma deblocking with samples across the edge one sample apart and
 * successive lines `stride` apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2989

    
2990
/**
 * tc0-clipped chroma edge filter over eight positions along one edge.
 *
 * Positions are handled in four pairs; pair i is clipped with tc0[i] and is
 * skipped entirely when tc0[i] <= 0. Only p0 and q0 are ever rewritten.
 *
 * @param pix     q0 sample of the first edge position
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step between successive positions along the edge
 * @param alpha   exclusive upper bound on |p0 - q0|
 * @param beta    exclusive upper bound on |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping values, one per pair
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int pair;

    for (pair = 0; pair < 4; pair++) {
        const int tc = tc0[pair];
        int n;

        if (tc <= 0) {
            pix += 2*ystride;
            continue;
        }

        for (n = 0; n < 2; n++, pix += ystride) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if (FFABS(p0 - q0) >= alpha ||
                FFABS(p1 - p0) >= beta  ||
                FFABS(q1 - q0) >= beta)
                continue;

            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);

            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
        }
    }
}
3018
/**
 * Chroma edge filter with samples across the edge `stride` apart and
 * successive edge positions one byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
3022
/**
 * Chroma edge filter with samples across the edge one byte apart and
 * successive edge positions `stride` apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
3026

    
3027
/**
 * Unclipped chroma edge filter over eight positions along one edge.
 *
 * Where the alpha/beta tests pass, p0 and q0 are replaced by rounded weighted
 * averages of their neighbours; no tc clipping is applied.
 *
 * @param pix     q0 sample of the first edge position
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step between successive positions along the edge
 * @param alpha   exclusive upper bound on |p0 - q0|
 * @param beta    exclusive upper bound on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;

    for (n = 0; n < 8; n++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
3046
/**
 * Unclipped chroma edge filter with samples across the edge `stride` apart
 * and successive edge positions one byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
3050
/**
 * Unclipped chroma edge filter with samples across the edge one byte apart
 * and successive edge positions `stride` apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
3054

    
3055
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated |pix1 - pix2| over 16 x h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3082

    
3083
/**
 * Sum of absolute differences between a 16-wide block and a reference in which
 * each pixel is combined with its right-hand neighbour through avg2().
 * Reads 17 reference pixels per row.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3110

    
3111
/**
 * Sum of absolute differences between a 16-wide block and a reference in which
 * each pixel is combined with the pixel one row below through avg2().
 * Reads h+1 reference rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3140

    
3141
/**
 * Sum of absolute differences between a 16-wide block and a reference in which
 * each pixel is combined with its right, lower and lower-right neighbours
 * through avg4(). Reads 17 reference pixels per row over h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3170

    
3171
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between successive rows of both blocks
 * @param h         number of rows to compare
 * @return accumulated |pix1 - pix2| over 8 x h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3190

    
3191
/**
 * Sum of absolute differences between an 8-wide block and a reference in which
 * each pixel is combined with its right-hand neighbour through avg2().
 * Reads 9 reference pixels per row.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3210

    
3211
/**
 * Sum of absolute differences between an 8-wide block and a reference in which
 * each pixel is combined with the pixel one row below through avg2().
 * Reads h+1 reference rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3232

    
3233
/**
 * Sum of absolute differences between an 8-wide block and a reference in which
 * each pixel is combined with its right, lower and lower-right neighbours
 * through avg4(). Reads 9 reference pixels per row over h+1 rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between successive rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3254

    
3255
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3256
    MpegEncContext *c = v;
3257
    int score1=0;
3258
    int score2=0;
3259
    int x,y;
3260

    
3261
    for(y=0; y<h; y++){
3262
        for(x=0; x<16; x++){
3263
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3264
        }
3265
        if(y+1<h){
3266
            for(x=0; x<15; x++){
3267
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3268
                             - s1[x+1] + s1[x+1+stride])
3269
                        -FFABS(  s2[x  ] - s2[x  +stride]
3270
                             - s2[x+1] + s2[x+1+stride]);
3271
            }
3272
        }
3273
        s1+= stride;
3274
        s2+= stride;
3275
    }
3276

    
3277
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3278
    else  return score1 + FFABS(score2)*8;
3279
}
3280

    
3281
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3282
    MpegEncContext *c = v;
3283
    int score1=0;
3284
    int score2=0;
3285
    int x,y;
3286

    
3287
    for(y=0; y<h; y++){
3288
        for(x=0; x<8; x++){
3289
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3290
        }
3291
        if(y+1<h){
3292
            for(x=0; x<7; x++){
3293
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3294
                             - s1[x+1] + s1[x+1+stride])
3295
                        -FFABS(  s2[x  ] - s2[x  +stride]
3296
                             - s2[x+1] + s2[x+1+stride]);
3297
            }
3298
        }
3299
        s1+= stride;
3300
        s2+= stride;
3301
    }
3302

    
3303
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3304
    else  return score1 + FFABS(score2)*8;
3305
}
3306

    
3307
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3308
    int i;
3309
    unsigned int sum=0;
3310

    
3311
    for(i=0; i<8*8; i++){
3312
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3313
        int w= weight[i];
3314
        b>>= RECON_SHIFT;
3315
        assert(-512<b && b<512);
3316

    
3317
        sum += (w*b)*(w*b)>>4;
3318
    }
3319
    return sum>>2;
3320
}
3321

    
3322
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3323
    int i;
3324

    
3325
    for(i=0; i<8*8; i++){
3326
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3327
    }
3328
}
3329

    
3330
/**
3331
 * permutes an 8x8 block.
3332
 * @param block the block which will be permuted according to the given permutation vector
3333
 * @param permutation the permutation vector
3334
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3335
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3336
 *                  (inverse) permutated to scantable order!
3337
 */
3338
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3339
{
3340
    int i;
3341
    DCTELEM temp[64];
3342

    
3343
    if(last<=0) return;
3344
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3345

    
3346
    for(i=0; i<=last; i++){
3347
        const int j= scantable[i];
3348
        temp[j]= block[j];
3349
        block[j]=0;
3350
    }
3351

    
3352
    for(i=0; i<=last; i++){
3353
        const int j= scantable[i];
3354
        const int perm_j= permutation[j];
3355
        block[perm_j]= temp[j];
3356
    }
3357
}
3358

    
3359
/**
 * Comparison function that ignores all of its arguments and reports no
 * difference.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int no_difference = 0;

    return no_difference;
}
3362

    
3363
/**
 * Fills a table of five comparison functions for the requested metric.
 *
 * All five entries of @p cmp are cleared first. The low 8 bits of @p type then
 * select which DSPContext function table is copied in. For an unrecognized
 * type an error is logged once and the table is left zeroed.
 *
 * @param c    DSP context providing the per-metric function tables
 * @param cmp  destination table, at least five entries
 * @param type FF_CMP_* selector; only the low 8 bits are examined
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* size by the element type rather than assuming it equals sizeof(void*) */
    memset(cmp, 0, sizeof(*cmp)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* the selector does not depend on i, so every further iteration
             * would only repeat this message: report once and stop */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
            return;
        }
    }
}
3422

    
3423
/**
 * Zeroes one block of 64 coefficients.
 */
static void clear_block_c(DCTELEM *block)
{
    const size_t block_bytes = 64 * sizeof(*block);

    memset(block, 0, block_bytes);
}
3427

    
3428
/**
3429
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3430
 */
3431
static void clear_blocks_c(DCTELEM *blocks)
3432
{
3433
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3434
}
3435

    
3436
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3437
    long i;
3438
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3439
        long a = *(long*)(src+i);
3440
        long b = *(long*)(dst+i);
3441
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3442
    }
3443
    for(; i<w; i++)
3444
        dst[i+0] += src[i+0];
3445
}
3446

    
3447
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3448
    long i;
3449
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3450
        long a = *(long*)(src1+i);
3451
        long b = *(long*)(src2+i);
3452
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3453
    }
3454
    for(; i<w; i++)
3455
        dst[i] = src1[i]+src2[i];
3456
}
3457

    
3458
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3459
    long i;
3460
#ifndef HAVE_FAST_UNALIGNED
3461
    if((long)src2 & (sizeof(long)-1)){
3462
        for(i=0; i+7<w; i+=8){
3463
            dst[i+0] = src1[i+0]-src2[i+0];
3464
            dst[i+1] = src1[i+1]-src2[i+1];
3465
            dst[i+2] = src1[i+2]-src2[i+2];
3466
            dst[i+3] = src1[i+3]-src2[i+3];
3467
            dst[i+4] = src1[i+4]-src2[i+4];
3468
            dst[i+5] = src1[i+5]-src2[i+5];
3469
            dst[i+6] = src1[i+6]-src2[i+6];
3470
            dst[i+7] = src1[i+7]-src2[i+7];
3471
        }
3472
    }else
3473
#endif
3474
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3475
        long a = *(long*)(src1+i);
3476
        long b = *(long*)(src2+i);
3477
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3478
    }
3479
    for(; i<w; i++)
3480
        dst[i+0] = src1[i+0]-src2[i+0];
3481
}
3482

    
3483
/**
 * Writes, for each of w positions, the byte difference between src2[i] and a
 * median prediction.
 *
 * The prediction is mid_pred() of the left value, src1[i], and
 * (left + src1[i] - left_top) & 0xFF. After each position, left becomes
 * src2[i] and left_top becomes src1[i].
 *
 * @param dst      output residuals
 * @param src1     row supplying the second predictor and the next left_top
 * @param src2     row being predicted
 * @param w        number of positions
 * @param left     in: initial left value; out: last src2 value seen
 * @param left_top in: initial left_top value; out: last src1 value seen
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int top  = src1[n];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_left_top)&0xFF);

        cur_left_top = top;
        cur_left     = src2[n];
        dst[n]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
3500

    
3501
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2 stores i1+i2 in o1 and i1-i2 in o2.
 * BUTTERFLY1 replaces x with x+y and y with x-y, using the old values of both.
 * BUTTERFLYA yields |x+y| + |x-y| without storing anything.
 *
 * The two statement macros are wrapped in do { } while(0) so they act as a
 * single statement (safe under an unbraced if/else) and must be followed by a
 * semicolon. BUTTERFLY1 keeps its temporaries under names unlikely to clash
 * with the caller's arguments.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

#define BUTTERFLY1(x,y) \
do {\
    const int bfly1_x_= (x);\
    const int bfly1_y_= (y);\
    (x)= bfly1_x_+bfly1_y_;\
    (y)= bfly1_x_-bfly1_y_;\
} while(0)

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3515

    
3516
/**
 * Sum of absolute 8x8 butterfly-transform coefficients of the difference
 * src - dst.
 *
 * Each row of differences goes through three butterfly stages, then each
 * column goes through two stored stages and a third that is folded directly
 * into the absolute sum.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block dst is subtracted from
 * @param stride distance in bytes between successive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;
        int k;

        for (k = 0; k < 8; k += 2) {
            BUTTERFLY2(row[k], row[k+1], sp[k]-dp[k], sp[k+1]-dp[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        for (k = 0; k < 4; k++) {
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* column pass; the last stage only feeds the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3567

    
3568
/**
 * Sum of absolute 8x8 butterfly-transform coefficients of a single block,
 * with the |temp[0] + temp[32]| term (marked "-mean" in the code) removed
 * from the total.
 *
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance in bytes between successive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        int k;

        for (k = 0; k < 8; k += 2) {
            BUTTERFLY2(row[k], row[k+1], sp[k], sp[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        for (k = 0; k < 4; k++) {
            BUTTERFLY1(row[k], row[k+4]);
        }
    }

    /* column pass; the last stage only feeds the absolute sum */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3615

    
3616
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3617
    MpegEncContext * const s= (MpegEncContext *)c;
3618
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3619
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3620

    
3621
    assert(h==8);
3622

    
3623
    s->dsp.diff_pixels(temp, src1, src2, stride);
3624
    s->dsp.fdct(temp);
3625
    return s->dsp.sum_abs_dctelem(temp);
3626
}
3627

    
3628
#ifdef CONFIG_GPL
/*
 * One 8-point integer transform pass. The user defines SRC(x) to read input x
 * and DST(x,v) to consume output x. Every input is read into a local before
 * the first DST is issued, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients after applying DCT8_1D to the rows and then to
 * the columns of the 8x8 difference src1 - src2.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between successive rows
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM coef[8][8];
    int total=0;
    int i;

    s->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* row pass: results are written back in place */
#define SRC(x) coef[i][x]
#define DST(x,v) coef[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* column pass: outputs are not stored, only their magnitudes are summed */
#define SRC(x) coef[x][i]
#define DST(x,v) total += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return total;
}
#endif
3680

    
3681
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3682
    MpegEncContext * const s= (MpegEncContext *)c;
3683
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3684
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3685
    int sum=0, i;
3686

    
3687
    assert(h==8);
3688

    
3689
    s->dsp.diff_pixels(temp, src1, src2, stride);
3690
    s->dsp.fdct(temp);
3691

    
3692
    for(i=0; i<64; i++)
3693
        sum= FFMAX(sum, FFABS(temp[i]));
3694

    
3695
    return sum;
3696
}
3697

    
3698
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3699
    MpegEncContext * const s= (MpegEncContext *)c;
3700
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3701
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3702
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3703
    int sum=0, i;
3704

    
3705
    assert(h==8);
3706
    s->mb_intra=0;
3707

    
3708
    s->dsp.diff_pixels(temp, src1, src2, stride);
3709

    
3710
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3711

    
3712
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3713
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3714
    ff_simple_idct(temp); //FIXME
3715

    
3716
    for(i=0; i<64; i++)
3717
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3718

    
3719
    return sum;
3720
}
3721

    
3722
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3723
    MpegEncContext * const s= (MpegEncContext *)c;
3724
    const uint8_t *scantable= s->intra_scantable.permutated;
3725
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3726
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3727
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3728
    uint8_t * const bak= (uint8_t*)aligned_bak;
3729
    int i, last, run, bits, level, distortion, start_i;
3730
    const int esc_length= s->ac_esc_length;
3731
    uint8_t * length;
3732
    uint8_t * last_length;
3733

    
3734
    assert(h==8);
3735

    
3736
    for(i=0; i<8; i++){
3737
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3738
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3739
    }
3740

    
3741
    s->dsp.diff_pixels(temp, src1, src2, stride);
3742

    
3743
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3744

    
3745
    bits=0;
3746

    
3747
    if (s->mb_intra) {
3748
        start_i = 1;
3749
        length     = s->intra_ac_vlc_length;
3750
        last_length= s->intra_ac_vlc_last_length;
3751
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3752
    } else {
3753
        start_i = 0;
3754
        length     = s->inter_ac_vlc_length;
3755
        last_length= s->inter_ac_vlc_last_length;
3756
    }
3757

    
3758
    if(last>=start_i){
3759
        run=0;
3760
        for(i=start_i; i<last; i++){
3761
            int j= scantable[i];
3762
            level= temp[j];
3763

    
3764
            if(level){
3765
                level+=64;
3766
                if((level&(~127)) == 0){
3767
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3768
                }else
3769
                    bits+= esc_length;
3770
                run=0;
3771
            }else
3772
                run++;
3773
        }
3774
        i= scantable[last];
3775

    
3776
        level= temp[i] + 64;
3777

    
3778
        assert(level - 64);
3779

    
3780
        if((level&(~127)) == 0){
3781
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3782
        }else
3783
            bits+= esc_length;
3784

    
3785
    }
3786

    
3787
    if(last>=0){
3788
        if(s->mb_intra)
3789
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3790
        else
3791
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3792
    }
3793

    
3794
    s->dsp.idct_add(bak, stride, temp);
3795

    
3796
    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3797

    
3798
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3799
}
3800

    
3801
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3802
    MpegEncContext * const s= (MpegEncContext *)c;
3803
    const uint8_t *scantable= s->intra_scantable.permutated;
3804
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3805
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3806
    int i, last, run, bits, level, start_i;
3807
    const int esc_length= s->ac_esc_length;
3808
    uint8_t * length;
3809
    uint8_t * last_length;
3810

    
3811
    assert(h==8);
3812

    
3813
    s->dsp.diff_pixels(temp, src1, src2, stride);
3814

    
3815
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3816

    
3817
    bits=0;
3818

    
3819
    if (s->mb_intra) {
3820
        start_i = 1;
3821
        length     = s->intra_ac_vlc_length;
3822
        last_length= s->intra_ac_vlc_last_length;
3823
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3824
    } else {
3825
        start_i = 0;
3826
        length     = s->inter_ac_vlc_length;
3827
        last_length= s->inter_ac_vlc_last_length;
3828
    }
3829

    
3830
    if(last>=start_i){
3831
        run=0;
3832
        for(i=start_i; i<last; i++){
3833
            int j= scantable[i];
3834
            level= temp[j];
3835

    
3836
            if(level){
3837
                level+=64;
3838
                if((level&(~127)) == 0){
3839
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3840
                }else
3841
                    bits+= esc_length;
3842
                run=0;
3843
            }else
3844
                run++;
3845
        }
3846
        i= scantable[last];
3847

    
3848
        level= temp[i] + 64;
3849

    
3850
        assert(level - 64);
3851

    
3852
        if((level&(~127)) == 0){
3853
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3854
        }else
3855
            bits+= esc_length;
3856
    }
3857

    
3858
    return bits;
3859
}
3860

    
3861
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score (0 when h <= 1)
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++, s+= stride){
        const uint8_t *below= s + stride;

        for(col=0; col<16; col++)
            total+= FFABS(s[col] - below[col]);
    }

    return total;
}
3875

    
3876
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: for every
 * pair of adjacent rows, sums |(s1 - s2) on the upper row - (s1 - s2) on the
 * lower row| over the 16 columns.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score (0 when h <= 1)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++, s1+= stride, s2+= stride){
        for(col=0; col<16; col++){
            const int change= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            total+= FFABS(change);
        }
    }

    return total;
}
3890

    
3891
/* Square of an expression; note that the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score (0 when h <= 1)
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++, s+= stride){
        const uint8_t *below= s + stride;

        for(col=0; col<16; col++){
            const int step= s[col] - below[col];
            total+= SQ(step);
        }
    }

    return total;
}
3906

    
3907
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: for every
 * pair of adjacent rows, sums the square of
 * (s1 - s2) on the upper row - (s1 - s2) on the lower row over the 16 columns.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between consecutive rows (both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated score (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++, s1+= stride, s2+= stride){
        for(col=0; col<16; col++){
            const int change= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            total+= SQ(change);
        }
    }

    return total;
}
3921

    
3922
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, size elements
 * @param pix2 second vector, size elements
 * @param size number of elements compared
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total= 0;
    int k= 0;

    while(k < size){
        const int delta= pix1[k] - pix2[k];
        total += delta*delta;
        k++;
    }
    return total;
}
3930

    
3931
/*
 * Each WRAPPER8_16_SQ(name8, name16) line expands a macro defined outside
 * this section. Going by the argument names it pairs an 8x8 comparison
 * function with a 16-pixel-wide variant -- confirm against the macro
 * definition. The dct264 pair is only instantiated when CONFIG_GPL is defined.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3932
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3933
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3934
#ifdef CONFIG_GPL
3935
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3936
#endif
3937
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3938
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3939
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3940
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3941

    
3942
/**
 * In-place element-wise product: dst[i] *= src[i] for i in [0, len).
 *
 * @param dst vector that is multiplied and overwritten
 * @param src multiplier vector
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k= 0;

    while(k < len){
        dst[k] = dst[k] * src[k];
        k++;
    }
}
3947

    
3948
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 *
 * @param dst  output vector, len elements
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for(k=0; k<len; k++)
        dst[k] = src0[k] * src1[len-1-k];
}
3954

    
3955
/**
 * Multiply-add with a strided destination:
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3 for i in [0, len).
 *
 * @param dst  output; only every step-th element is written
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 integer constant added to every element
 * @param len  number of elements produced
 * @param step distance between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k= 0;

    while(k < len){
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
        k++;
    }
}
3960

    
3961
/**
 * Windowed overlap of two len-element inputs into a 2*len-element output.
 *
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k] + add_bias
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m] + add_bias
 *
 * @param dst      output, 2*len elements
 * @param src0     first input, len elements, read forwards
 * @param src1     second input, len elements, read backwards
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for(k=0; k<len; k++) {
        const int m = 2*len - 1 - k;
        /* load everything before storing: same read/write order as an
           offset-pointer walk over the two halves */
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[m];

        dst[k] = s0*wj - s1*wi + add_bias;
        dst[m] = s0*wi + s1*wj + add_bias;
    }
}
3975

    
3976
/**
 * Convert integers to float and scale them: dst[i] = src[i] * mul.
 *
 * @param dst output vector, len elements
 * @param src integer input vector, len elements
 * @param mul scale factor
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k= 0;

    while(k < len){
        dst[k] = src[k] * mul;
        k++;
    }
}
3981

    
3982
/**
 * Convert one float sample to an integer using only integer operations on
 * its bit pattern.
 *
 * If none of bits 16..19 of the pattern is set, the result is the pattern
 * minus 0x8000. Otherwise the pattern is first replaced by
 * (0x43c0ffff - pattern) >> 31; the commented-out alternative in the body
 * spells out the intended outcome of that shift (0xFFFF above 0x43c0ffff,
 * 0 otherwise).
 *
 * @param src pointer to the sample to convert
 * @return the converted value; callers in this file store it into int16_t
 */
static av_always_inline int float_to_int16_one(const float *src){
    /* Fetch the bit pattern through a union: reading a float object through
       an int32_t lvalue breaks the C aliasing rules and may be miscompiled. */
    union { float f; int32_t i; } pattern;
    int_fast32_t tmp;

    pattern.f = *src;
    tmp = pattern.i;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3992

    
3993
/**
 * Convert len float samples to int16_t, one float_to_int16_one() call per
 * sample.
 *
 * @param dst output samples, len elements
 * @param src input samples, len elements
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k= 0;

    while(k < len){
        dst[k] = float_to_int16_one(&src[k]);
        k++;
    }
}
3998

    
3999
/**
 * Convert planar float channels to interleaved int16_t samples:
 * dst[i*channels + c] receives the conversion of src[c][i].
 *
 * @param dst      interleaved output, len*channels elements
 * @param src      array of channels pointers, each to len floats
 * @param len      number of samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, c;

    /* dedicated stereo path: both channels handled in one pass */
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(&src[0][i]);
            dst[2*i+1] = float_to_int16_one(&src[1][i]);
        }
        return;
    }

    /* generic path: one pass per channel */
    for(c=0; c<channels; c++){
        for(i=0; i<len; i++)
            dst[i*channels + c] = float_to_int16_one(&src[c][i]);
    }
}
4012

    
4013
/**
 * In-place vector addition: v1[i] += v2[i] for order elements.
 *
 * @param v1    accumulator vector, overwritten
 * @param v2    vector that is added
 * @param order number of elements
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;

    for (k = 0; order != 0; order--, k++)
        v1[k] += v2[k];
}
4018

    
4019
/**
 * In-place vector subtraction: v1[i] -= v2[i] for order elements.
 *
 * @param v1    vector that is reduced and overwritten
 * @param v2    vector that is subtracted
 * @param order number of elements
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;

    for (k = 0; order != 0; order--, k++)
        v1[k] -= v2[k];
}
4024

    
4025
/**
 * Scalar product of two int16_t vectors, with every individual product
 * shifted right by shift before it is accumulated.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements
 * @param shift right shift applied to each product
 * @return sum over i of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
4034

    
4035
/* Fixed-point factors for the WMV2 IDCT. W1..W7 are 2048*sqrt(2)*cos(k*pi/16)
   rounded to an integer; W0 is the plain scale 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * In-place 8-point row pass of the WMV2 IDCT.
 *
 * @param b the 8 consecutive coefficients of one row; overwritten with the
 *          transformed values, each rounded and shifted right by 8
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: weighted sums and differences of the input pairs */
    const int r1 = W1*b[1]+W7*b[7];
    const int r7 = W7*b[1]-W1*b[7];
    const int r5 = W5*b[5]+W3*b[3];
    const int r3 = W3*b[5]-W5*b[3];
    const int r2 = W2*b[2]+W6*b[6];
    const int r6 = W6*b[2]-W2*b[6];
    const int r0 = W0*b[0]+W0*b[4];
    const int r4 = W0*b[0]-W0*b[4];
    /* step 2: the two mixed odd terms, scaled by 181/256 */
    const int mix_a = (181*(r1-r5+r7-r3)+128)>>8;//1,3,5,7,
    const int mix_b = (181*(r1-r5-r7+r3)+128)>>8;
    /* step 3: combine the even and odd parts; outputs k and 7-k share one pair */
    const int rnd     = 1<<7;
    const int even_07 = r0+r2, odd_07 = r1+r5;
    const int even_16 = r4+r6;
    const int even_25 = r4-r6;
    const int even_34 = r0-r2, odd_34 = r7+r3;

    b[0] = (even_07 + odd_07 + rnd)>>8;
    b[7] = (even_07 - odd_07 + rnd)>>8;
    b[1] = (even_16 + mix_a  + rnd)>>8;
    b[6] = (even_16 - mix_a  + rnd)>>8;
    b[2] = (even_25 + mix_b  + rnd)>>8;
    b[5] = (even_25 - mix_b  + rnd)>>8;
    b[3] = (even_34 + odd_34 + rnd)>>8;
    b[4] = (even_34 - odd_34 + rnd)>>8;
}
4070
static void wmv2_idct_col(short * b)
4071
{
4072
    int s1,s2;
4073
    int a0,a1,a2,a3,a4,a5,a6,a7;
4074
    /*step 1, with extended precision*/
4075
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4076
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4077
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4078
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4079
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4080
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4081
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4082
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4083
    /*step 2*/
4084
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4085
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4086
    /*step 3*/
4087
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4088
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4089
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4090
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4091

    
4092
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4093
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4094
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4095
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4096
}
4097
/**
 * In-place 8x8 WMV2 IDCT: wmv2_idct_row() on each of the 8 rows, then
 * wmv2_idct_col() on each of the 8 columns.
 *
 * @param block 64 coefficients, row pitch 8; overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for(row=0; row<8; row++)
        wmv2_idct_row(block + 8*row);

    for(col=0; col<8; col++)
        wmv2_idct_col(block + col);
}
4107
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4108
 converted */
4109
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);                          /* transform in place */
    put_pixels_clamped_c(block, dest, line_size);   /* output the result  */
}
4114
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);                          /* transform in place */
    add_pixels_clamped_c(block, dest, line_size);   /* output the result  */
}
4119
/**
 * Run j_rev_dct() on block, then pass the result to put_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
4124
/**
 * Run j_rev_dct() on block, then pass the result to add_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4129

    
4130
/**
 * Run j_rev_dct4() on block, then pass the result to put_pixels_clamped4_c()
 * together with dest and line_size.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}
4135
/**
 * Run j_rev_dct4() on block, then pass the result to add_pixels_clamped4_c()
 * together with dest and line_size.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4140

    
4141
/**
 * Run j_rev_dct2() on block, then pass the result to put_pixels_clamped2_c()
 * together with dest and line_size.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}
4146
/**
 * Run j_rev_dct2() on block, then pass the result to add_pixels_clamped2_c()
 * together with dest and line_size.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4151

    
4152
/**
 * Single-pixel "put": writes dest[0] from block[0] alone.
 *
 * block[0] is rounded and shifted right by 3, then looked up in ff_cropTbl at
 * an offset of MAX_NEG_CROP. line_size is not used.
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    const int dc = (block[0] + 4)>>3;

    dest[0] = ff_cropTbl[MAX_NEG_CROP + dc];
}
4158
/**
 * Single-pixel "add": updates dest[0] from block[0] alone.
 *
 * block[0] is rounded and shifted right by 3 and added to the current
 * dest[0]; the sum is looked up in ff_cropTbl at an offset of MAX_NEG_CROP.
 * line_size is not used.
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    const int dc = (block[0] + 4)>>3;

    dest[0] = ff_cropTbl[MAX_NEG_CROP + dest[0] + dc];
}
4164

    
4165
/** No-op with a (mem, stride, h) signature; every parameter is ignored. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
    return;
}
4166

    
4167
/* init static data */
4168
void dsputil_static_init(void)
4169
{
4170
    int i;
4171

    
4172
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4173
    for(i=0;i<MAX_NEG_CROP;i++) {
4174
        ff_cropTbl[i] = 0;
4175
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4176
    }
4177

    
4178
    for(i=0;i<512;i++) {
4179
        ff_squareTbl[i] = (i - 256) * (i - 256);
4180
    }
4181

    
4182
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4183
}
4184

    
4185
int ff_check_alignment(void){
4186
    static int did_fail=0;
4187
    DECLARE_ALIGNED_16(int, aligned);
4188

    
4189
    if((long)&aligned & 15){
4190
        if(!did_fail){
4191
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4192
            av_log(NULL, AV_LOG_ERROR,
4193
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4194
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4195
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4196
                "Do not report crashes to FFmpeg developers.\n");
4197
#endif
4198
            did_fail=1;
4199
        }
4200
        return -1;
4201
    }
4202
    return 0;
4203
}
4204

    
4205
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4206
{
4207
    int i;
4208

    
4209
    ff_check_alignment();
4210

    
4211
#ifdef CONFIG_ENCODERS
4212
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4213
        c->fdct = fdct_ifast;
4214
        c->fdct248 = fdct_ifast248;
4215
    }
4216
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4217
        c->fdct = ff_faandct;
4218
        c->fdct248 = ff_faandct248;
4219
    }
4220
    else {
4221
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4222
        c->fdct248 = ff_fdct248_islow;
4223
    }
4224
#endif //CONFIG_ENCODERS
4225

    
4226
    if(avctx->lowres==1){
4227
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4228
            c->idct_put= ff_jref_idct4_put;
4229
            c->idct_add= ff_jref_idct4_add;
4230
        }else{
4231
            c->idct_put= ff_h264_lowres_idct_put_c;
4232
            c->idct_add= ff_h264_lowres_idct_add_c;
4233
        }
4234
        c->idct    = j_rev_dct4;
4235
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4236
    }else if(avctx->lowres==2){
4237
        c->idct_put= ff_jref_idct2_put;
4238
        c->idct_add= ff_jref_idct2_add;
4239
        c->idct    = j_rev_dct2;
4240
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4241
    }else if(avctx->lowres==3){
4242
        c->idct_put= ff_jref_idct1_put;
4243
        c->idct_add= ff_jref_idct1_add;
4244
        c->idct    = j_rev_dct1;
4245
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4246
    }else{
4247
        if(avctx->idct_algo==FF_IDCT_INT){
4248
            c->idct_put= ff_jref_idct_put;
4249
            c->idct_add= ff_jref_idct_add;