/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
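/* (~0UL/255 is 0x01...01 at the native word size, so the multiplications
 * above broadcast the byte constant into every byte lane.) */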

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
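/* Maps scan position to raster index, e.g. ff_zigzag_direct[2] == 8: the
 * third coefficient visited is at row 1, column 0 of the 8x8 block. */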

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
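/* e.g. b=3: 1000*ff_inverse[3] = 1000*1431655766 = 1431655766000, and
 * 1431655766000>>32 == 333 == 1000/3 */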
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
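
/* Builds the CPU-specific scan table: st->permutated[] is the scan order run
 * through the IDCT's input permutation, and st->raster_end[i] records the
 * highest permuted index seen up to scan position i. */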
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
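
/* pix_norm1_c() returns the sum of squared pixel values of a 16x16 block;
 * the word-at-a-time variants below fetch 4 or 8 pixels per native load. */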
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
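
/* sse4/sse8/sse16 return the sum of squared differences over blocks of
 * width 4, 8 and 16; sq points 256 entries into ff_squareTbl so negative
 * differences index their squares directly. */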


#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
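/* Wavelet-domain distortion metric for the snow encoder: the pixel
 * difference is transformed with ff_spatial_dwt() and the coefficient
 * magnitudes are summed with the per-subband weights in scale[][][][]. */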
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width x height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
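
/* Typical use (motion compensation): when a motion vector points partly
 * outside the reference picture, the caller copies the needed area into a
 * scratch buffer with this function and runs the normal MC code on it. */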

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b
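/* rnd_avg32()/no_rnd_avg32() (from dsputil.h) average four bytes at once:
 * (a|b) - (((a^b)&0xFEFEFEFE)>>1) rounds halves up, and
 * (a&b) + (((a^b)&0xFEFEFEFE)>>1) rounds them down, using
 * a+b == 2*(a&b) + (a^b); the mask keeps carries inside each byte lane. */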

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
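
/* The gmc1 weights always satisfy A+B+C+D == 256, so the >>8 above is an
 * exact normalization of the 2x2 bilinear interpolation. */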

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
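
/* ff_gmc_c() samples an affine motion field: the sub-pel source position
 * (vx,vy) advances by (dxx,dyx) per x step and (ox,oy) by (dxy,dyy) per y
 * step; out-of-picture coordinates are clipped to the nearest edge sample. */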

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
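
/* 683 ~= 2^11/3, so (683*(2*a + b + 1))>>11 approximates (2*a + b + 1)/3
 * without a division; the mc11..mc22 variants below use the same trick with
 * 2731 ~= 2^15/12 for their /12 weightings. */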

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1544
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

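/* H.264 chroma MC is a bilinear filter with eighth-pel weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which always sum to 64;
 * op_put/op_avg below therefore round with (v + 32) >> 6.  When D is
 * zero (x==0 or y==0) the filter degenerates to two taps along one
 * axis, which the else branch exploits via E=B+C and a single step. */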
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

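/* A "no rounding" variant of the mc8 case follows; it is registered
 * separately in dsputil and presumably exists for codecs whose chroma
 * MC specifies a downward-biased rounding rule rather than for H.264
 * itself. */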
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    /* same bilinear filter as put_h264_chroma_mc8_c above, but with
       the rounding bias reduced from 32 to 28 */
    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

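/* MPEG-4 quarter-pel MC.  The half-pel lowpass filter below uses the
 * taps (20, -6, 3, -1)/32 with mirrored (rather than replicated) edge
 * samples; quarter-pel positions are then formed by averaging two
 * planes with pixels*_l2/_l4.  op_put rounds with +16 before >>5, the
 * _no_rnd_ variants with +15. */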
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

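/* H.264 luma half-pel interpolation: the 6-tap filter
 * (1, -5, 20, 20, -5, 1) with (v + 16) >> 5 rounding.  The hv (center)
 * case filters twice, keeping the unclipped first pass in the int16_t
 * tmp[] buffer, so OP2 rounds with (v + 512) >> 10 instead. */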
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

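/* H264_MC expands the 16 quarter-pel positions for one block size.
 * _mcXY_c means an offset of X quarter-pels horizontally and Y
 * vertically: mc00 is a plain copy, mc20/mc02/mc22 use the lowpass
 * results directly, and the remaining positions average two
 * intermediate planes with pixels*_l2. */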
#define H264_MC(OPNAME, SIZE) \
2462
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2463
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2464
}\
2465
\
2466
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2467
    uint8_t half[SIZE*SIZE];\
2468
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2469
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2470
}\
2471
\
2472
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2473
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2474
}\
2475
\
2476
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2477
    uint8_t half[SIZE*SIZE];\
2478
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2479
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2480
}\
2481
\
2482
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2483
    uint8_t full[SIZE*(SIZE+5)];\
2484
    uint8_t * const full_mid= full + SIZE*2;\
2485
    uint8_t half[SIZE*SIZE];\
2486
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2487
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2488
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2489
}\
2490
\
2491
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2492
    uint8_t full[SIZE*(SIZE+5)];\
2493
    uint8_t * const full_mid= full + SIZE*2;\
2494
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2495
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2496
}\
2497
\
2498
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2499
    uint8_t full[SIZE*(SIZE+5)];\
2500
    uint8_t * const full_mid= full + SIZE*2;\
2501
    uint8_t half[SIZE*SIZE];\
2502
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2503
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2504
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2505
}\
2506
\
2507
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2508
    uint8_t full[SIZE*(SIZE+5)];\
2509
    uint8_t * const full_mid= full + SIZE*2;\
2510
    uint8_t halfH[SIZE*SIZE];\
2511
    uint8_t halfV[SIZE*SIZE];\
2512
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2513
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2514
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2515
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2516
}\
2517
\
2518
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2519
    uint8_t full[SIZE*(SIZE+5)];\
2520
    uint8_t * const full_mid= full + SIZE*2;\
2521
    uint8_t halfH[SIZE*SIZE];\
2522
    uint8_t halfV[SIZE*SIZE];\
2523
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2524
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

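/* op_put/op_avg round the 6-tap filter output: the taps (1,-5,20,20,-5,1)
   sum to 32, hence (b + 16) >> 5; the op2_* variants handle the 2D (hv)
   case where two such passes are concatenated, giving a gain of 32*32 =
   1024 and thus (b + 512) >> 10. */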
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

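/* H.264 weighted prediction: op_scale1 scales a single reference
   (pix*weight >> log2_denom, with rounding and the spec's offset folded
   into 'offset' below), op_scale2 blends two references with
   weights/weightd and a shared rounding offset. */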
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

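/* WMV2 half-pel interpolation: a 4-tap (-1,9,9,-1)/16 filter,
   i.e. (9*(b+c) - (a+d) + 8) >> 4 with rounding. */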
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

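/* H.263 in-loop deblocking (Annex J style): d measures the step across the
   block edge; d1 is a piecewise-linear ramp of d that fades the correction
   out again for |d| >= 2*strength, so strong (real) edges are preserved. */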
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

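/* H.264 in-loop deblocking: alpha/beta are the QP-derived edge/flatness
   thresholds; tc0[i] is the per-4-pixel-segment clipping value (negative
   means skip), incremented once for each of p1/q1 that is also filtered. */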
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

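/* SAD (sum of absolute differences) functions for motion estimation; the
   _x2/_y2/_xy2 variants compare against the half-pel (averaged) reference. */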
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

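/* NSSE (noise preserving sum of squared differences): score1 is plain SSE,
   score2 compares the local gradient texture of the two blocks so that
   noise-like differences are penalized less than structural ones; the
   balance is set by avctx->nsse_weight. */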
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

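/* helpers for the encoder's quantization noise shaping: try_8x8basis_c
   returns the weighted squared error of the residual after adding
   scale*basis (fixed point, see BASIS_SHIFT/RECON_SHIFT), add_8x8basis_c
   commits that change to the residual. */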
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

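/* selects the comparison functions (one entry per block size) for a given
   FF_CMP_* metric, e.g. from avctx->me_cmp / me_sub_cmp / mb_cmp. */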
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

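/* SWAR byte-wise add: one machine word processes sizeof(long) bytes at a
   time; the low 7 bits of each byte are added directly and the MSBs are
   fixed up with xor, so carries never cross byte boundaries (see the
   pb_7f/pb_80 masks above). */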
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

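/* SATD: 8x8 Hadamard transform of the difference block, summing absolute
   coefficients; the macros below are the Walsh-Hadamard butterflies, with
   BUTTERFLYA fusing the last stage and the FFABS accumulation. */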
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

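/* under CONFIG_GPL: a SATD-like metric using the H.264 8x8 integer
   transform (DCT8_1D is one pass of it; rows first, then columns, summing
   absolute coefficients). */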
#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

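/* rate-distortion metric: quantizes, entropy-codes and reconstructs the
   block for real, returning SSE plus a lambda*bits rate term
   (lambda proportional to qscale^2, cf. the 109/128 factor below). */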
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}

void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i]*win[len-i-1] + src1[i]*win[i] + add_bias;
}

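/* relies on the IEEE-754 float layout: callers pre-bias samples (cf. the
   add_bias parameter above) so values land in [384.0, 386.0), where the low
   16 bits of the bit pattern equal sample + 0x8000; inputs outside that
   range set bits 16-19 and take the saturating branch. */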
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

void ff_float_to_int16_interleave_c(int16_t *dst, const float *src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src+i);
            dst[2*i+1] = float_to_int16_one(src+i+len);
        }
    }else{
        for(c=0; c<channels; c++, src+=len)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src+i);
    }
}

static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
       *v1++ += *v2++;
}

static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}

static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

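/* WMV2 IDCT: even/odd butterfly decomposition using the W* constants above;
   the (181*x + 128) >> 8 rotations approximate x/sqrt(2). */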
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed ASAP when all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

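/* ff_cropTbl is the clamping table behind the cm[] lookups above,
   ff_squareTbl caches (i-256)^2 for the squared-error metrics, and
   inv_zigzag_direct16 is the 1-based inverse zigzag used by the MMX
   quantizer. */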
/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

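    /* forward DCT (encoders only): FASTINT selects the fast ifast integer
       DCT, FAAN the floating-point FAAN DCT, everything else falls back to
       the slow but accurate integer DCT */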
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

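    /* lowres decoding halves the output resolution once per level, so each
       level needs an IDCT emitting an (8>>lowres)x(8>>lowres) block; at
       full resolution, dispatch on the requested idct_algo instead */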
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

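    /* H.264's own 4x4/8x8 integer transforms get dedicated DSPContext
       entries instead of going through the generic idct_put/idct_add */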
    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

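    /* install the portable C versions as defaults; optimized CPU-specific
       versions can override individual pointers afterwards */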
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */