Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 911e21a3

History | View | Annotate | Download (160 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "h263.h"
36
#include "snow.h"
37

    
38
/* snow.c */
39
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40

    
41
/* vorbis.c */
42
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43

    
44
/* ac3dec.c */
45
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
46

    
47
/* flacenc.c */
48
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
49

    
50
/* pngdec.c */
51
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
52

    
53
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
54
uint32_t ff_squareTbl[512] = {0, };
55

    
56
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
57
#define pb_7f (~0UL/255 * 0x7f)
58
#define pb_80 (~0UL/255 * 0x80)
59

    
60
const uint8_t ff_zigzag_direct[64] = {
61
    0,   1,  8, 16,  9,  2,  3, 10,
62
    17, 24, 32, 25, 18, 11,  4,  5,
63
    12, 19, 26, 33, 40, 48, 41, 34,
64
    27, 20, 13,  6,  7, 14, 21, 28,
65
    35, 42, 49, 56, 57, 50, 43, 36,
66
    29, 22, 15, 23, 30, 37, 44, 51,
67
    58, 59, 52, 45, 38, 31, 39, 46,
68
    53, 60, 61, 54, 47, 55, 62, 63
69
};
70

    
71
/* Specific zigzag scan for 248 idct. NOTE that unlike the
72
   specification, we interleave the fields */
73
const uint8_t ff_zigzag248_direct[64] = {
74
     0,  8,  1,  9, 16, 24,  2, 10,
75
    17, 25, 32, 40, 48, 56, 33, 41,
76
    18, 26,  3, 11,  4, 12, 19, 27,
77
    34, 42, 49, 57, 50, 58, 35, 43,
78
    20, 28,  5, 13,  6, 14, 21, 29,
79
    36, 44, 51, 59, 52, 60, 37, 45,
80
    22, 30,  7, 15, 23, 31, 38, 46,
81
    53, 61, 54, 62, 39, 47, 55, 63,
82
};
83

    
84
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
85
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
86

    
87
const uint8_t ff_alternate_horizontal_scan[64] = {
88
    0,  1,   2,  3,  8,  9, 16, 17,
89
    10, 11,  4,  5,  6,  7, 15, 14,
90
    13, 12, 19, 18, 24, 25, 32, 33,
91
    26, 27, 20, 21, 22, 23, 28, 29,
92
    30, 31, 34, 35, 40, 41, 48, 49,
93
    42, 43, 36, 37, 38, 39, 44, 45,
94
    46, 47, 50, 51, 56, 57, 58, 59,
95
    52, 53, 54, 55, 60, 61, 62, 63,
96
};
97

    
98
const uint8_t ff_alternate_vertical_scan[64] = {
99
    0,  8,  16, 24,  1,  9,  2, 10,
100
    17, 25, 32, 40, 48, 56, 57, 49,
101
    41, 33, 26, 18,  3, 11,  4, 12,
102
    19, 27, 34, 42, 50, 58, 35, 43,
103
    51, 59, 20, 28,  5, 13,  6, 14,
104
    21, 29, 36, 44, 52, 60, 37, 45,
105
    53, 61, 22, 30,  7, 15, 23, 31,
106
    38, 46, 54, 62, 39, 47, 55, 63,
107
};
108

    
109
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
110
const uint32_t ff_inverse[256]={
111
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
112
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
113
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
114
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
115
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
116
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
117
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
118
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
119
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
120
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
121
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
122
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
123
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
124
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
125
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
126
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
127
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
128
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
129
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
130
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
131
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
132
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
133
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
134
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
135
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
136
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
137
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
138
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
139
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
140
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
141
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
142
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
143
};
144

    
145
/* Input permutation for the simple_idct_mmx */
146
static const uint8_t simple_mmx_permutation[64]={
147
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
148
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
149
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
150
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
151
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
152
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
153
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
154
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
155
};
156

    
157
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
158

    
159
/**
 * Initialize a ScanTable from a scan order and a CPU-specific permutation.
 * Fills st->permutated with the permuted scan, and st->raster_end[i] with
 * the highest permuted index seen among the first i+1 scan positions.
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, max;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        const int idx = src_scantable[i];
        st->permutated[i] = permutation[idx];
#ifdef ARCH_POWERPC
        /* PPC IDCTs also need the inverse mapping */
        st->inverse[idx] = i;
#endif
    }

    /* running maximum of the permuted positions */
    max = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > max)
            max = st->permutated[i];
        st->raster_end[i] = max;
    }
}
182

    
183
/* Sum of all 256 samples of a 16x16 block; line_size is the row stride. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
204

    
205
static int pix_norm1_c(uint8_t * pix, int line_size)
206
{
207
    int s, i, j;
208
    uint32_t *sq = ff_squareTbl + 256;
209

    
210
    s = 0;
211
    for (i = 0; i < 16; i++) {
212
        for (j = 0; j < 16; j += 8) {
213
#if 0
214
            s += sq[pix[0]];
215
            s += sq[pix[1]];
216
            s += sq[pix[2]];
217
            s += sq[pix[3]];
218
            s += sq[pix[4]];
219
            s += sq[pix[5]];
220
            s += sq[pix[6]];
221
            s += sq[pix[7]];
222
#else
223
#if LONG_MAX > 2147483647
224
            register uint64_t x=*(uint64_t*)pix;
225
            s += sq[x&0xff];
226
            s += sq[(x>>8)&0xff];
227
            s += sq[(x>>16)&0xff];
228
            s += sq[(x>>24)&0xff];
229
            s += sq[(x>>32)&0xff];
230
            s += sq[(x>>40)&0xff];
231
            s += sq[(x>>48)&0xff];
232
            s += sq[(x>>56)&0xff];
233
#else
234
            register uint32_t x=*(uint32_t*)pix;
235
            s += sq[x&0xff];
236
            s += sq[(x>>8)&0xff];
237
            s += sq[(x>>16)&0xff];
238
            s += sq[(x>>24)&0xff];
239
            x=*(uint32_t*)(pix+4);
240
            s += sq[x&0xff];
241
            s += sq[(x>>8)&0xff];
242
            s += sq[(x>>16)&0xff];
243
            s += sq[(x>>24)&0xff];
244
#endif
245
#endif
246
            pix += 8;
247
        }
248
        pix += line_size - 16;
249
    }
250
    return s;
251
}
252

    
253
/* Byte-swap w 32-bit words from src into dst (buffers may be the same). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    /* main loop, eight words per iteration */
    while (i + 8 <= w) {
        dst[i    ] = bswap_32(src[i    ]);
        dst[i + 1] = bswap_32(src[i + 1]);
        dst[i + 2] = bswap_32(src[i + 2]);
        dst[i + 3] = bswap_32(src[i + 3]);
        dst[i + 4] = bswap_32(src[i + 4]);
        dst[i + 5] = bswap_32(src[i + 5]);
        dst[i + 6] = bswap_32(src[i + 6]);
        dst[i + 7] = bswap_32(src[i + 7]);
        i += 8;
    }
    /* up to seven remaining words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
270

    
271
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
272
{
273
    int s, i;
274
    uint32_t *sq = ff_squareTbl + 256;
275

    
276
    s = 0;
277
    for (i = 0; i < h; i++) {
278
        s += sq[pix1[0] - pix2[0]];
279
        s += sq[pix1[1] - pix2[1]];
280
        s += sq[pix1[2] - pix2[2]];
281
        s += sq[pix1[3] - pix2[3]];
282
        pix1 += line_size;
283
        pix2 += line_size;
284
    }
285
    return s;
286
}
287

    
288
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
289
{
290
    int s, i;
291
    uint32_t *sq = ff_squareTbl + 256;
292

    
293
    s = 0;
294
    for (i = 0; i < h; i++) {
295
        s += sq[pix1[0] - pix2[0]];
296
        s += sq[pix1[1] - pix2[1]];
297
        s += sq[pix1[2] - pix2[2]];
298
        s += sq[pix1[3] - pix2[3]];
299
        s += sq[pix1[4] - pix2[4]];
300
        s += sq[pix1[5] - pix2[5]];
301
        s += sq[pix1[6] - pix2[6]];
302
        s += sq[pix1[7] - pix2[7]];
303
        pix1 += line_size;
304
        pix2 += line_size;
305
    }
306
    return s;
307
}
308

    
309
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
310
{
311
    int s, i;
312
    uint32_t *sq = ff_squareTbl + 256;
313

    
314
    s = 0;
315
    for (i = 0; i < h; i++) {
316
        s += sq[pix1[ 0] - pix2[ 0]];
317
        s += sq[pix1[ 1] - pix2[ 1]];
318
        s += sq[pix1[ 2] - pix2[ 2]];
319
        s += sq[pix1[ 3] - pix2[ 3]];
320
        s += sq[pix1[ 4] - pix2[ 4]];
321
        s += sq[pix1[ 5] - pix2[ 5]];
322
        s += sq[pix1[ 6] - pix2[ 6]];
323
        s += sq[pix1[ 7] - pix2[ 7]];
324
        s += sq[pix1[ 8] - pix2[ 8]];
325
        s += sq[pix1[ 9] - pix2[ 9]];
326
        s += sq[pix1[10] - pix2[10]];
327
        s += sq[pix1[11] - pix2[11]];
328
        s += sq[pix1[12] - pix2[12]];
329
        s += sq[pix1[13] - pix2[13]];
330
        s += sq[pix1[14] - pix2[14]];
331
        s += sq[pix1[15] - pix2[15]];
332

    
333
        pix1 += line_size;
334
        pix2 += line_size;
335
    }
336
    return s;
337
}
338

    
339

    
340
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
341
/**
 * Wavelet-domain distortion metric (snow encoder).
 * Builds the difference block pix1 - pix2 (scaled by 16), runs a spatial
 * DWT on it, then returns a weighted sum of the absolute subband
 * coefficients, scaled down by 2^9.
 * @param v    unused context pointer (common comparison-function signature)
 * @param w    block width; must equal h (asserted). 8 uses 3 decomposition
 *             levels, anything else (16/32) uses 4.
 * @param type wavelet type forwarded to ff_spatial_dwt; per the scale table
 *             comments, 0 selects the 9/7 and 1 the 5/3 weights.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];   /* transform workspace, fixed stride of 32 */
    int level, ori;
    /* per-subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* difference block, left-shifted for extra precision in the transform */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* walk every subband; level 0 includes the LL band (ori 0),
       deeper levels only the LH/HL/HH orientations */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);          /* subband edge length */
            int sx= (ori&1) ? size : 0;              /* horizontal offset */
            int stride= 32<<(dec_count-level);       /* row step inside tmp */
            int sy= (ori&2) ? stride>>1 : 0;         /* vertical offset */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;   /* undo the weight/precision scaling */
}
409

    
410
/* 5/3-wavelet score of an 8x8 block (w_c type=1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
413

    
414
/* 9/7-wavelet score of an 8x8 block (w_c type=0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
417

    
418
/* 5/3-wavelet score of a 16x16 block (w_c type=1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
421

    
422
/* 9/7-wavelet score of a 16x16 block (w_c type=0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
425

    
426
/* 5/3-wavelet score of a 32x32 block (w_c type=1).
   Non-static — presumably referenced from snow.c; verify before changing linkage. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
429

    
430
/* 9/7-wavelet score of a 32x32 block (w_c type=0).
   Non-static — presumably referenced from snow.c; verify before changing linkage. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
433
#endif
434

    
435
/* draw the edges of width 'w' of an image of size width, height */
436
//FIXME check that this is ok for mpeg4 interlaced
437
/* Draw an edge border of width 'w' around a width x height image.
   The buffer must have w extra rows above/below and w extra columns
   left/right of buf. */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *bottom = buf + (height - 1) * wrap;
    uint8_t *row;
    int i;

    /* replicate the first and last rows w times */
    for (i = 1; i <= w; i++) {
        memcpy(buf    - i * wrap, buf,    width);
        memcpy(bottom + i * wrap, bottom, width);
    }

    /* extend every row to the left and to the right */
    row = buf;
    for (i = 0; i < height; i++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* fill the four corner areas from the nearest image corner sample */
    for (i = 1; i <= w; i++) {
        memset(buf    - i * wrap - w,     buf[0],            w); /* top left */
        memset(buf    - i * wrap + width, buf[width - 1],    w); /* top right */
        memset(bottom + i * wrap - w,     bottom[0],         w); /* bottom left */
        memset(bottom + i * wrap + width, bottom[width - 1], w); /* bottom right */
    }
}
463

    
464
/**
465
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
466
 * @param buf destination buffer
467
 * @param src source buffer
468
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
469
 * @param block_w width of block
470
 * @param block_h height of block
471
 * @param src_x x coordinate of the top left sample of the block in the source buffer
472
 * @param src_y y coordinate of the top left sample of the block in the source buffer
473
 * @param w width of the source buffer
474
 * @param h height of the source buffer
475
 */
476
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* If the block lies entirely outside the source picture, clamp
       src_y/src_x (moving src by the same amount) so that exactly one
       sample row/column still overlaps; the replication below then fills
       the rest of the block from it. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* portion of the block that overlaps the valid source area */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first valid row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last valid row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* left/right replication runs over all rows, so the corners are
       filled from the already-replicated top/bottom rows */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
534

    
535
/* Load an 8x8 block of pixels into DCT coefficients (plain widening copy). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = pixels[x];
        pixels += line_size;
        block  += 8;
    }
}
553

    
554
/* Store the 8x8 per-sample difference s1 - s2 as DCT coefficients. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
573

    
574

    
575
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
576
                                 int line_size)
577
{
578
    int i;
579
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
580

    
581
    /* read the pixels */
582
    for(i=0;i<8;i++) {
583
        pixels[0] = cm[block[0]];
584
        pixels[1] = cm[block[1]];
585
        pixels[2] = cm[block[2]];
586
        pixels[3] = cm[block[3]];
587
        pixels[4] = cm[block[4]];
588
        pixels[5] = cm[block[5]];
589
        pixels[6] = cm[block[6]];
590
        pixels[7] = cm[block[7]];
591

    
592
        pixels += line_size;
593
        block += 8;
594
    }
595
}
596

    
597
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
598
                                 int line_size)
599
{
600
    int i;
601
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
602

    
603
    /* read the pixels */
604
    for(i=0;i<4;i++) {
605
        pixels[0] = cm[block[0]];
606
        pixels[1] = cm[block[1]];
607
        pixels[2] = cm[block[2]];
608
        pixels[3] = cm[block[3]];
609

    
610
        pixels += line_size;
611
        block += 8;
612
    }
613
}
614

    
615
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
616
                                 int line_size)
617
{
618
    int i;
619
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
620

    
621
    /* read the pixels */
622
    for(i=0;i<2;i++) {
623
        pixels[0] = cm[block[0]];
624
        pixels[1] = cm[block[1]];
625

    
626
        pixels += line_size;
627
        block += 8;
628
    }
629
}
630

    
631
/* Store an 8x8 block of signed coefficients as pixels: each value is
   offset by +128 and clamped to 0..255. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = block[x] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[x] = (uint8_t)v;
        }
        block  += 8;
        pixels += line_size;
    }
}
651

    
652
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
653
                          int line_size)
654
{
655
    int i;
656
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
657

    
658
    /* read the pixels */
659
    for(i=0;i<8;i++) {
660
        pixels[0] = cm[pixels[0] + block[0]];
661
        pixels[1] = cm[pixels[1] + block[1]];
662
        pixels[2] = cm[pixels[2] + block[2]];
663
        pixels[3] = cm[pixels[3] + block[3]];
664
        pixels[4] = cm[pixels[4] + block[4]];
665
        pixels[5] = cm[pixels[5] + block[5]];
666
        pixels[6] = cm[pixels[6] + block[6]];
667
        pixels[7] = cm[pixels[7] + block[7]];
668
        pixels += line_size;
669
        block += 8;
670
    }
671
}
672

    
673
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
674
                          int line_size)
675
{
676
    int i;
677
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
678

    
679
    /* read the pixels */
680
    for(i=0;i<4;i++) {
681
        pixels[0] = cm[pixels[0] + block[0]];
682
        pixels[1] = cm[pixels[1] + block[1]];
683
        pixels[2] = cm[pixels[2] + block[2]];
684
        pixels[3] = cm[pixels[3] + block[3]];
685
        pixels += line_size;
686
        block += 8;
687
    }
688
}
689

    
690
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
691
                          int line_size)
692
{
693
    int i;
694
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
695

    
696
    /* read the pixels */
697
    for(i=0;i<2;i++) {
698
        pixels[0] = cm[pixels[0] + block[0]];
699
        pixels[1] = cm[pixels[1] + block[1]];
700
        pixels += line_size;
701
        block += 8;
702
    }
703
}
704

    
705
/* Add an 8x8 coefficient block to the pixels in place, without clamping. */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            pixels[x] += block[x];
        pixels += line_size;
        block  += 8;
    }
}
721

    
722
/* Add a 4x4 coefficient block to the pixels in place, without clamping.
   Unlike the 8-wide variant, the block here is packed with a stride of 4. */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int y, x;

    for (y = 0; y < 4; y++) {
        for (x = 0; x < 4; x++)
            pixels[x] += block[x];
        pixels += line_size;
        block  += 4;
    }
}
734

    
735
/* Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int i;

    for (i = 0; i < 64; i++) {
        const int v = block[i];
        total += v < 0 ? -v : v;
    }
    return total;
}
742

    
743
#if 0
744

745
#define PIXOP2(OPNAME, OP) \
746
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
747
{\
748
    int i;\
749
    for(i=0; i<h; i++){\
750
        OP(*((uint64_t*)block), AV_RN64(pixels));\
751
        pixels+=line_size;\
752
        block +=line_size;\
753
    }\
754
}\
755
\
756
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
757
{\
758
    int i;\
759
    for(i=0; i<h; i++){\
760
        const uint64_t a= AV_RN64(pixels  );\
761
        const uint64_t b= AV_RN64(pixels+1);\
762
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
763
        pixels+=line_size;\
764
        block +=line_size;\
765
    }\
766
}\
767
\
768
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
769
{\
770
    int i;\
771
    for(i=0; i<h; i++){\
772
        const uint64_t a= AV_RN64(pixels  );\
773
        const uint64_t b= AV_RN64(pixels+1);\
774
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
775
        pixels+=line_size;\
776
        block +=line_size;\
777
    }\
778
}\
779
\
780
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
781
{\
782
    int i;\
783
    for(i=0; i<h; i++){\
784
        const uint64_t a= AV_RN64(pixels          );\
785
        const uint64_t b= AV_RN64(pixels+line_size);\
786
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
787
        pixels+=line_size;\
788
        block +=line_size;\
789
    }\
790
}\
791
\
792
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
793
{\
794
    int i;\
795
    for(i=0; i<h; i++){\
796
        const uint64_t a= AV_RN64(pixels          );\
797
        const uint64_t b= AV_RN64(pixels+line_size);\
798
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
799
        pixels+=line_size;\
800
        block +=line_size;\
801
    }\
802
}\
803
\
804
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
805
{\
806
        int i;\
807
        const uint64_t a= AV_RN64(pixels  );\
808
        const uint64_t b= AV_RN64(pixels+1);\
809
        uint64_t l0=  (a&0x0303030303030303ULL)\
810
                    + (b&0x0303030303030303ULL)\
811
                    + 0x0202020202020202ULL;\
812
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
813
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
814
        uint64_t l1,h1;\
815
\
816
        pixels+=line_size;\
817
        for(i=0; i<h; i+=2){\
818
            uint64_t a= AV_RN64(pixels  );\
819
            uint64_t b= AV_RN64(pixels+1);\
820
            l1=  (a&0x0303030303030303ULL)\
821
               + (b&0x0303030303030303ULL);\
822
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
823
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
824
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
825
            pixels+=line_size;\
826
            block +=line_size;\
827
            a= AV_RN64(pixels  );\
828
            b= AV_RN64(pixels+1);\
829
            l0=  (a&0x0303030303030303ULL)\
830
               + (b&0x0303030303030303ULL)\
831
               + 0x0202020202020202ULL;\
832
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
833
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
834
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
835
            pixels+=line_size;\
836
            block +=line_size;\
837
        }\
838
}\
839
\
840
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
841
{\
842
        int i;\
843
        const uint64_t a= AV_RN64(pixels  );\
844
        const uint64_t b= AV_RN64(pixels+1);\
845
        uint64_t l0=  (a&0x0303030303030303ULL)\
846
                    + (b&0x0303030303030303ULL)\
847
                    + 0x0101010101010101ULL;\
848
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
849
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
850
        uint64_t l1,h1;\
851
\
852
        pixels+=line_size;\
853
        for(i=0; i<h; i+=2){\
854
            uint64_t a= AV_RN64(pixels  );\
855
            uint64_t b= AV_RN64(pixels+1);\
856
            l1=  (a&0x0303030303030303ULL)\
857
               + (b&0x0303030303030303ULL);\
858
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
859
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
860
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
861
            pixels+=line_size;\
862
            block +=line_size;\
863
            a= AV_RN64(pixels  );\
864
            b= AV_RN64(pixels+1);\
865
            l0=  (a&0x0303030303030303ULL)\
866
               + (b&0x0303030303030303ULL)\
867
               + 0x0101010101010101ULL;\
868
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
869
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
870
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
871
            pixels+=line_size;\
872
            block +=line_size;\
873
        }\
874
}\
875
\
876
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
877
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
878
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
879
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
880
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
881
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
882
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounded per-byte average: equivalent to (a+b+1)>>1 in each byte lane,
 * computed as (a|b) - (((a^b) & 0xFE…FE) >> 1) to avoid cross-lane carries. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/*
 * 32-bit fallback of the pixel copy/average primitives (the branch taken when
 * the 64-bit variant above is not used — see the "#else // 64 bit variant").
 * PIXOP2(OPNAME, OP) generates, for OPNAME in {put, avg}:
 *   - straight block copies (_pixels2/4/8/16_c),
 *   - half-pel interpolation (_x2/_y2/_xy2) with rounding and no-rounding
 *     ("no_rnd") variants, working on packed bytes via AV_RN16/AV_RN32,
 *   - two- and four-source linear blends (_l2/_l4) used by qpel MC.
 * The _xy2 kernels use the classic SWAR split into low 2 bits (l0/l1) and
 * high 6 bits (h0/h1) so four bytes are averaged per 32-bit operation.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

#define op_avg(a, b) a = rnd_avg32(a, b)
1254
#endif
1255
#define op_put(a, b) a = b
1256

    
1257
PIXOP2(avg, op_avg)
1258
PIXOP2(put, op_put)
1259
#undef op_avg
1260
#undef op_put
1261

    
1262
#define avg2(a,b) ((a+b+1)>>1)
1263
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1264

    
1265
/* Adapter: no-rounding 16-wide two-source blend with a single common stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

/* Adapter: no-rounding 8-wide two-source blend with a single common stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

/**
 * One-point GMC: 8-pixel-wide bilinear interpolation with 1/16-pel
 * fractional offsets (x16, y16 in [0,16]); weights A..D sum to 256,
 * result is (weighted sum + rounder) >> 8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

/**
 * Global motion compensation for an 8-pixel-wide column strip with an affine
 * motion field: per-pixel source position advances by (dxx,dyx) horizontally
 * and (dxy,dyy) per row, in 16.16 fixed point. Positions outside the source
 * are clamped to the edge via av_clip, with the interpolation degenerating to
 * one-dimensional (or plain copy) at the borders; r is the rounding constant.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest clamped sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

/* Integer-position thirdpel copy: dispatch to the width-specialized copies. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

/* Thirdpel, horizontal 1/3: round((2*a + b)/3); 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel, horizontal 2/3: round((a + 2*b)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel, vertical 1/3: round((2*top + bottom)/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (1/3,1/3): weighted 2x2 average 4:3:3:2, 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (1/3,2/3): weighted 2x2 average 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel, vertical 2/3: round((top + 2*bottom)/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (2/3,1/3): weighted 2x2 average 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (2/3,2/3): weighted 2x2 average 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Integer-position thirdpel average: dispatch to width-specialized averages. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

/* Thirdpel horizontal 1/3, then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel horizontal 2/3, then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel vertical 1/3, then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (1/3,1/3), then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (1/3,2/3), then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel vertical 2/3, then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (2/3,1/3), then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Thirdpel (2/3,2/3), then rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

#if 0
/* Disabled: width-specialized thirdpel wrappers (never enabled). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

/*
 * H.264 chroma motion compensation: bilinear interpolation with 1/8-pel
 * weights A..D (A+B+C+D == 64). When D == 0 one axis has no fraction and
 * the 2x2 filter collapses to a two-tap filter along the remaining axis
 * (step selects vertical vs horizontal neighbour). Widths 2, 4 and 8.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1671
#define op_put(a, b) a = (((b) + 32)>>6)
1672

    
1673
H264_CHROMA_MC(put_       , op_put)
1674
H264_CHROMA_MC(avg_       , op_avg)
1675
#undef op_avg
1676
#undef op_put
1677

    
1678
/**
 * 8-pixel-wide bilinear chroma interpolation with reduced rounding.
 * Weights A..D come from the 1/8-pel fractional offsets (x, y); the
 * rounding constant is 32-4 = 28 instead of the round-to-nearest 32
 * (hence "no_rnd"; NOTE(review): presumably matches a codec that
 * specifies this biased rounding — confirm against the caller).
 * dst is assumed 8-byte aligned, src may be unaligned; h rows are filtered.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    /* fractional offsets are 1/8-pel, so both must lie in [0, 7] */
    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1701

    
1702
/**
 * QPEL_MC(r, OPNAME, RND, OP)
 *
 * Expands to the full MPEG-4 quarter-pel motion-compensation function set
 * for one output operation: 8- and 16-wide horizontal/vertical lowpass
 * filters (tap weights 20, -6, 3, -1 with mirrored edge handling) plus the
 * sixteen mcXY fractional-position functions built by averaging the filter
 * outputs with pixels8/16_l2/l4 helpers.  OP writes one clipped pixel via
 * the cm crop table; RND selects the rounding variant of the internal
 * "put" primitives.  The ff_*_old_c variants keep an older (halfV-based)
 * interpolation scheme.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2184

    
2185
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2186
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2187
#define op_put(a, b) a = cm[((b) + 16)>>5]
2188
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2189

    
2190
QPEL_MC(0, put_       , _       , op_put)
2191
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2192
QPEL_MC(0, avg_       , _       , op_avg)
2193
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2194
#undef op_avg
2195
#undef op_avg_no_rnd
2196
#undef op_put
2197
#undef op_put_no_rnd
2198

    
2199
#if 1
2200
#define H264_LOWPASS(OPNAME, OP, OP2) \
2201
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2202
    const int h=2;\
2203
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2204
    int i;\
2205
    for(i=0; i<h; i++)\
2206
    {\
2207
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2208
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2209
        dst+=dstStride;\
2210
        src+=srcStride;\
2211
    }\
2212
}\
2213
\
2214
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2215
    const int w=2;\
2216
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2217
    int i;\
2218
    for(i=0; i<w; i++)\
2219
    {\
2220
        const int srcB= src[-2*srcStride];\
2221
        const int srcA= src[-1*srcStride];\
2222
        const int src0= src[0 *srcStride];\
2223
        const int src1= src[1 *srcStride];\
2224
        const int src2= src[2 *srcStride];\
2225
        const int src3= src[3 *srcStride];\
2226
        const int src4= src[4 *srcStride];\
2227
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2228
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2229
        dst++;\
2230
        src++;\
2231
    }\
2232
}\
2233
\
2234
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2235
    const int h=2;\
2236
    const int w=2;\
2237
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2238
    int i;\
2239
    src -= 2*srcStride;\
2240
    for(i=0; i<h+5; i++)\
2241
    {\
2242
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2243
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2244
        tmp+=tmpStride;\
2245
        src+=srcStride;\
2246
    }\
2247
    tmp -= tmpStride*(h+5-2);\
2248
    for(i=0; i<w; i++)\
2249
    {\
2250
        const int tmpB= tmp[-2*tmpStride];\
2251
        const int tmpA= tmp[-1*tmpStride];\
2252
        const int tmp0= tmp[0 *tmpStride];\
2253
        const int tmp1= tmp[1 *tmpStride];\
2254
        const int tmp2= tmp[2 *tmpStride];\
2255
        const int tmp3= tmp[3 *tmpStride];\
2256
        const int tmp4= tmp[4 *tmpStride];\
2257
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2258
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2259
        dst++;\
2260
        tmp++;\
2261
    }\
2262
}\
2263
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2264
    const int h=4;\
2265
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2266
    int i;\
2267
    for(i=0; i<h; i++)\
2268
    {\
2269
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2270
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2271
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2272
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2273
        dst+=dstStride;\
2274
        src+=srcStride;\
2275
    }\
2276
}\
2277
\
2278
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2279
    const int w=4;\
2280
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281
    int i;\
2282
    for(i=0; i<w; i++)\
2283
    {\
2284
        const int srcB= src[-2*srcStride];\
2285
        const int srcA= src[-1*srcStride];\
2286
        const int src0= src[0 *srcStride];\
2287
        const int src1= src[1 *srcStride];\
2288
        const int src2= src[2 *srcStride];\
2289
        const int src3= src[3 *srcStride];\
2290
        const int src4= src[4 *srcStride];\
2291
        const int src5= src[5 *srcStride];\
2292
        const int src6= src[6 *srcStride];\
2293
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2294
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2295
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2296
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2297
        dst++;\
2298
        src++;\
2299
    }\
2300
}\
2301
\
2302
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2303
    const int h=4;\
2304
    const int w=4;\
2305
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2306
    int i;\
2307
    src -= 2*srcStride;\
2308
    for(i=0; i<h+5; i++)\
2309
    {\
2310
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2311
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2312
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2313
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2314
        tmp+=tmpStride;\
2315
        src+=srcStride;\
2316
    }\
2317
    tmp -= tmpStride*(h+5-2);\
2318
    for(i=0; i<w; i++)\
2319
    {\
2320
        const int tmpB= tmp[-2*tmpStride];\
2321
        const int tmpA= tmp[-1*tmpStride];\
2322
        const int tmp0= tmp[0 *tmpStride];\
2323
        const int tmp1= tmp[1 *tmpStride];\
2324
        const int tmp2= tmp[2 *tmpStride];\
2325
        const int tmp3= tmp[3 *tmpStride];\
2326
        const int tmp4= tmp[4 *tmpStride];\
2327
        const int tmp5= tmp[5 *tmpStride];\
2328
        const int tmp6= tmp[6 *tmpStride];\
2329
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2330
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2331
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2332
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2333
        dst++;\
2334
        tmp++;\
2335
    }\
2336
}\
2337
\
2338
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339
    const int h=8;\
2340
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2341
    int i;\
2342
    for(i=0; i<h; i++)\
2343
    {\
2344
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2345
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2346
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2347
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2348
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2349
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2350
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2351
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2352
        dst+=dstStride;\
2353
        src+=srcStride;\
2354
    }\
2355
}\
2356
\
2357
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2358
    const int w=8;\
2359
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2360
    int i;\
2361
    for(i=0; i<w; i++)\
2362
    {\
2363
        const int srcB= src[-2*srcStride];\
2364
        const int srcA= src[-1*srcStride];\
2365
        const int src0= src[0 *srcStride];\
2366
        const int src1= src[1 *srcStride];\
2367
        const int src2= src[2 *srcStride];\
2368
        const int src3= src[3 *srcStride];\
2369
        const int src4= src[4 *srcStride];\
2370
        const int src5= src[5 *srcStride];\
2371
        const int src6= src[6 *srcStride];\
2372
        const int src7= src[7 *srcStride];\
2373
        const int src8= src[8 *srcStride];\
2374
        const int src9= src[9 *srcStride];\
2375
        const int src10=src[10*srcStride];\
2376
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2377
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2378
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2379
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2380
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2381
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2382
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2383
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2384
        dst++;\
2385
        src++;\
2386
    }\
2387
}\
2388
\
2389
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2390
    const int h=8;\
2391
    const int w=8;\
2392
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2393
    int i;\
2394
    src -= 2*srcStride;\
2395
    for(i=0; i<h+5; i++)\
2396
    {\
2397
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2398
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2399
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2400
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2401
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2402
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2403
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2404
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2405
        tmp+=tmpStride;\
2406
        src+=srcStride;\
2407
    }\
2408
    tmp -= tmpStride*(h+5-2);\
2409
    for(i=0; i<w; i++)\
2410
    {\
2411
        const int tmpB= tmp[-2*tmpStride];\
2412
        const int tmpA= tmp[-1*tmpStride];\
2413
        const int tmp0= tmp[0 *tmpStride];\
2414
        const int tmp1= tmp[1 *tmpStride];\
2415
        const int tmp2= tmp[2 *tmpStride];\
2416
        const int tmp3= tmp[3 *tmpStride];\
2417
        const int tmp4= tmp[4 *tmpStride];\
2418
        const int tmp5= tmp[5 *tmpStride];\
2419
        const int tmp6= tmp[6 *tmpStride];\
2420
        const int tmp7= tmp[7 *tmpStride];\
2421
        const int tmp8= tmp[8 *tmpStride];\
2422
        const int tmp9= tmp[9 *tmpStride];\
2423
        const int tmp10=tmp[10*tmpStride];\
2424
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2425
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2426
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2427
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2428
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2429
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2430
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2431
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2432
        dst++;\
2433
        tmp++;\
2434
    }\
2435
}\
2436
\
2437
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2438
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2439
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2440
    src += 8*srcStride;\
2441
    dst += 8*dstStride;\
2442
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2443
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444
}\
2445
\
2446
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2447
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2448
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2449
    src += 8*srcStride;\
2450
    dst += 8*dstStride;\
2451
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2452
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453
}\
2454
\
2455
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2456
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2457
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2458
    src += 8*srcStride;\
2459
    dst += 8*dstStride;\
2460
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2461
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462
}\
/**
 * Generates the 16 quarter-pel motion-compensation functions (one per
 * (x,y) fractional position) for a given block SIZE, built from the
 * h/v/hv lowpass halfpel filters and bilinear averaging (pixels_l2).
 * OPNAME selects "put_" or "avg_" store semantics.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2602
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2603
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2604
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2605
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2606

    
2607
H264_LOWPASS(put_       , op_put, op2_put)
2608
H264_LOWPASS(avg_       , op_avg, op2_avg)
2609
H264_MC(put_, 2)
2610
H264_MC(put_, 4)
2611
H264_MC(put_, 8)
2612
H264_MC(put_, 16)
2613
H264_MC(avg_, 4)
2614
H264_MC(avg_, 8)
2615
H264_MC(avg_, 16)
2616

    
2617
#undef op_avg
2618
#undef op_put
2619
#undef op2_avg
2620
#undef op2_put
2621
#endif
2622

    
2623
/* Weighted-prediction store operators (H.264 explicit weighting). */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Generates uni- and bi-directional weighted-prediction functions for a
 * WxH block: weight_..._c scales a block in place, biweight_..._c blends
 * src into dst with per-reference weights.  The `if(W==n) continue;`
 * chain lets one macro body serve all widths.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2694
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2695
    int i;
2696

    
2697
    for(i=0; i<h; i++){
2698
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2699
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2700
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2701
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2702
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2703
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2704
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2705
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2706
        dst+=dstStride;
2707
        src+=srcStride;
2708
    }
2709
}
2710

    
2711
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Integer-pel CAVS MC positions are plain pixel copies/averages. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2730
/* VC-1 specific */
2731
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2732

    
2733
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2734
    put_pixels8_c(dst, src, stride, 8);
2735
}
2736
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2737

    
2738
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2739

    
2740
/* H264 specific */
2741
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2742

    
2743
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2744
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2745
    int i;
2746

    
2747
    for(i=0; i<w; i++){
2748
        const int src_1= src[ -srcStride];
2749
        const int src0 = src[0          ];
2750
        const int src1 = src[  srcStride];
2751
        const int src2 = src[2*srcStride];
2752
        const int src3 = src[3*srcStride];
2753
        const int src4 = src[4*srcStride];
2754
        const int src5 = src[5*srcStride];
2755
        const int src6 = src[6*srcStride];
2756
        const int src7 = src[7*srcStride];
2757
        const int src8 = src[8*srcStride];
2758
        const int src9 = src[9*srcStride];
2759
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2760
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2761
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2762
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2763
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2764
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2765
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2766
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2767
        src++;
2768
        dst++;
2769
    }
2770
}
2771

    
2772
/* WMV2 mspel MC positions, built from the h/v lowpass filters above.
 * mcXY: X = horizontal fraction, Y = vertical fraction (halfpel units). */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* 11 rows of horizontal filtering: 8 output rows plus the 3 extra
     * rows the vertical pass needs above/below. */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2821
    if(ENABLE_ANY_H263) {
2822
    int x;
2823
    const int strength= ff_h263_loop_filter_strength[qscale];
2824

    
2825
    for(x=0; x<8; x++){
2826
        int d1, d2, ad1;
2827
        int p0= src[x-2*stride];
2828
        int p1= src[x-1*stride];
2829
        int p2= src[x+0*stride];
2830
        int p3= src[x+1*stride];
2831
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2832

    
2833
        if     (d<-2*strength) d1= 0;
2834
        else if(d<-  strength) d1=-2*strength - d;
2835
        else if(d<   strength) d1= d;
2836
        else if(d< 2*strength) d1= 2*strength - d;
2837
        else                   d1= 0;
2838

    
2839
        p1 += d1;
2840
        p2 -= d1;
2841
        if(p1&256) p1= ~(p1>>31);
2842
        if(p2&256) p2= ~(p2>>31);
2843

    
2844
        src[x-1*stride] = p1;
2845
        src[x+0*stride] = p2;
2846

    
2847
        ad1= FFABS(d1)>>1;
2848

    
2849
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2850

    
2851
        src[x-2*stride] = p0 - d2;
2852
        src[x+  stride] = p3 + d2;
2853
    }
2854
    }
2855
}
2856

    
2857
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2858
    if(ENABLE_ANY_H263) {
2859
    int y;
2860
    const int strength= ff_h263_loop_filter_strength[qscale];
2861

    
2862
    for(y=0; y<8; y++){
2863
        int d1, d2, ad1;
2864
        int p0= src[y*stride-2];
2865
        int p1= src[y*stride-1];
2866
        int p2= src[y*stride+0];
2867
        int p3= src[y*stride+1];
2868
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2869

    
2870
        if     (d<-2*strength) d1= 0;
2871
        else if(d<-  strength) d1=-2*strength - d;
2872
        else if(d<   strength) d1= d;
2873
        else if(d< 2*strength) d1= 2*strength - d;
2874
        else                   d1= 0;
2875

    
2876
        p1 += d1;
2877
        p2 -= d1;
2878
        if(p1&256) p1= ~(p1>>31);
2879
        if(p2&256) p2= ~(p2>>31);
2880

    
2881
        src[y*stride-1] = p1;
2882
        src[y*stride+0] = p2;
2883

    
2884
        ad1= FFABS(d1)>>1;
2885

    
2886
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2887

    
2888
        src[y*stride-2] = p0 - d2;
2889
        src[y*stride+1] = p3 + d2;
2890
    }
2891
    }
2892
}
2893

    
2894
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8 block.
 * Border rows/columns are passed through (scaled so the final shift is a
 * no-op on them); interior pixels get the full 2-D (1,2,1)x(1,2,1)/16
 * kernel with rounding.  Operates in place on src with the given stride.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* Vertical pass into temp (x4 gain); top/bottom rows copied as 4*src
     * so the horizontal pass treats all rows uniformly. */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* Horizontal pass with rounding; edge columns only undo the x4 gain. */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * H.264 normal (tc-based) luma deblocking for one 4x4-partitioned edge.
 * xstride steps across the edge, ystride along it; tc0[i] < 0 means the
 * i-th group of 4 lines is not filtered.  p2/q2 proximity to p0/q0
 * additionally enables the p1/q1 taps and widens the clip range.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical edge: step across with stride, along with 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Horizontal edge: step across with 1, along with stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2971
{
2972
    int i, d;
2973
    for( i = 0; i < 4; i++ ) {
2974
        const int tc = tc0[i];
2975
        if( tc <= 0 ) {
2976
            pix += 2*ystride;
2977
            continue;
2978
        }
2979
        for( d = 0; d < 2; d++ ) {
2980
            const int p0 = pix[-1*xstride];
2981
            const int p1 = pix[-2*xstride];
2982
            const int q0 = pix[0];
2983
            const int q1 = pix[1*xstride];
2984

    
2985
            if( FFABS( p0 - q0 ) < alpha &&
2986
                FFABS( p1 - p0 ) < beta &&
2987
                FFABS( q1 - q0 ) < beta ) {
2988

    
2989
                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2990

    
2991
                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
2992
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
2993
            }
2994
            pix += ystride;
2995
        }
2996
    }
2997
}
2998
/* Vertical-edge chroma deblocking (horizontal boundary). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Horizontal-edge chroma deblocking (vertical boundary). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 intra (strong) chroma deblocking filter over an 8-pixel edge.
 * No tc clipping: p0/q0 are replaced by fixed weighted averages when the
 * alpha/beta activity conditions hold.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Vertical-edge intra chroma deblocking (horizontal boundary). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Horizontal-edge intra chroma deblocking (vertical boundary). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences over a 16-wide block, h rows.
 * Unrolled by hand across the 16 columns (reference C implementation,
 * usually replaced by SIMD versions at runtime).
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against a half-pel horizontally interpolated
 * reference (avg2 of each pixel and its right neighbour; reads pix2[16]).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against a half-pel vertically interpolated
 * reference (avg2 of each pixel and the pixel one line below).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of a 16-wide block against a half-pel diagonally interpolated
 * reference (avg4 of the 2x2 neighbourhood; reads one extra column/row).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences over an 8-wide block, h rows.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against a half-pel horizontally interpolated
 * reference (reads pix2[8]).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against a half-pel vertically interpolated
 * reference.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of an 8-wide block against a half-pel diagonally interpolated
 * reference (avg4 of the 2x2 neighbourhood; reads pix2[8]/pix3[8]).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Noise-preserving SSE over a 16-wide block: plain SSE (score1) plus a
 * weighted penalty for the difference in horizontal gradients (score2),
 * so that preserved film grain is penalised less than blurring.
 * v may be NULL (then a fixed weight of 8 is used instead of
 * avctx->nsse_weight).
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * 8-wide variant of nsse16_c: SSE plus weighted gradient-difference
 * penalty. See nsse16_c for the rationale.
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * Estimate the weighted squared error that would remain if the scaled
 * basis function were added to the residual (used by the trellis/basis
 * search in the MPEG encoder).
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
/**
 * Add a scaled basis function to the 8x8 residual in place (rounded
 * fixed-point shift from BASIS_SHIFT down to RECON_SHIFT precision).
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
/**
3311
 * permutes an 8x8 block.
3312
 * @param block the block which will be permuted according to the given permutation vector
3313
 * @param permutation the permutation vector
3314
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3315
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3316
 *                  (inverse) permutated to scantable order!
3317
 */
3318
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3319
{
3320
    int i;
3321
    DCTELEM temp[64];
3322

    
3323
    if(last<=0) return;
3324
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3325

    
3326
    for(i=0; i<=last; i++){
3327
        const int j= scantable[i];
3328
        temp[j]= block[j];
3329
        block[j]=0;
3330
    }
3331

    
3332
    for(i=0; i<=last; i++){
3333
        const int j= scantable[i];
3334
        const int perm_j= permutation[j];
3335
        block[perm_j]= temp[j];
3336
    }
3337
}
3338

    
3339
/* Dummy compare function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fill cmp[0..4] with the comparison functions selected by 'type'
 * (an FF_CMP_* value; only the low byte is inspected). Unknown values
 * leave the slots zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
/**
3404
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3405
 */
3406
static void clear_blocks_c(DCTELEM *blocks)
3407
{
3408
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3409
}
3410

    
3411
/**
 * dst[i] += src[i] for w bytes, processed one machine word at a time
 * using the pb_7f/pb_80 carry-suppression trick; the byte-wise tail
 * handles the remainder.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * dst[i] = src1[i] + src2[i] for w bytes, word-at-a-time SWAR addition
 * with a byte-wise tail.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
/**
 * dst[i] = src1[i] - src2[i] for w bytes, word-at-a-time SWAR
 * subtraction. On targets without fast unaligned access, falls back to
 * an unrolled byte loop when src2 is misaligned.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median-prediction residual: dst[i] = src2[i] minus the median
 * predictor mid_pred(left, top, left+top-topleft). *left/*left_top carry
 * the running left and top-left samples across calls and are updated.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/* Hadamard transform helpers: sum/difference butterflies used by the
 * hadamard8_* comparison functions below. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference between src and dst. Rows are transformed first, then
 * columns, folding the final stage into the absolute-value sum.
 * (Dead "#if 0" max-tracking debug code removed.)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * Intra SATD: sum of absolute Hadamard coefficients of the src block
 * itself, minus the DC term (mean), so flat blocks score low.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
/**
 * DCT-based SAD: forward-transform the pixel difference and sum the
 * absolute coefficient magnitudes.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#ifdef CONFIG_GPL
/* One row/column of the H.264 8x8 integer transform (from x264, hence
 * the CONFIG_GPL guard). SRC/DST are redefined around each use. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD of the H.264 8x8 integer transform of the pixel difference:
 * rows in place, then columns folded directly into the abs sum.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
/**
 * DCT-based max metric: forward-transform the pixel difference and
 * return the largest absolute coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
/**
 * Quantization-noise metric: DCT the difference, quantize/dequantize/
 * inverse-transform it, and return the squared error against the
 * unquantized coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
/**
 * Rate-distortion metric for one 8x8 block: quantize the DCT of the
 * difference, count the VLC bits of the coefficients, reconstruct, and
 * return distortion + lambda-scaled bit cost.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the reference 8x8 so we can reconstruct into it below */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count VLC bits for the (run, level) pairs in scan order */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
/**
 * Bit-count metric: quantize the DCT of the difference and return the
 * number of VLC bits needed to code it (no distortion term).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
/**
 * Vertical SAD of a 16-wide block against itself shifted one row up:
 * measures vertical activity of the source (intra variant, no reference).
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical SAD of the residual: sums |d(x,y) - d(x,y+1)| where
 * d = s1 - s2, i.e. vertical gradient of the prediction error.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
3867
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3868
    int score=0;
3869
    int x,y;
3870

    
3871
    for(y=1; y<h; y++){
3872
        for(x=0; x<16; x+=4){
3873
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3874
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3875
        }
3876
        s+= stride;
3877
    }
3878

    
3879
    return score;
3880
}
3881

    
3882
/* Vertical SSE of the difference image s1 - s2, 16 pixels wide:
 * sums ((s1 - s2) - (s1_below - s2_below))^2 over the first h-1 rows. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
/* Sum of squared differences between an int8 vector and an int16 vector
 * of the same length. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int n, total = 0;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        total += d * d;
    }
    return total;
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3907
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3908
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3909
#ifdef CONFIG_GPL
3910
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3911
#endif
3912
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3913
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3914
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3915
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3916

    
3917
/* In-place element-wise multiply: dst[i] = dst[i] * src[i] for len floats. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = dst[n] * src[n];
}
/* Element-wise multiply with src1 read back-to-front:
 * dst[i] = src0[i] * src1[len-1-i]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
/* Strided multiply-add: dst[i*step] = src0[i]*src1[i] + src2[i] + src3.
 * Note src3 is an integer constant added to every element; summation
 * order is kept exactly as the reference implementation. */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int n;

    for (n = 0; n < len; n++)
        dst[n * step] = src0[n] * src1[n] + src2[n] + src3;
}
/* Overlap-windowing used by MDCT-based audio decoders: mixes the two
 * half-length inputs src0 and src1 through the 2*len window coefficients,
 * writing 2*len outputs (plus an optional DC bias).  The pointers are
 * biased by len so the loop can index symmetrically around the center. */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int lo, hi;

    dst  += len;
    win  += len;
    src0 += len;

    for (lo = -len, hi = len - 1; lo < 0; lo++, hi--) {
        const float s0 = src0[lo];
        const float s1 = src1[hi];
        const float wl = win[lo];
        const float wh = win[hi];

        /* same expression/evaluation order as the reference version */
        dst[lo] = s0 * wh - s1 * wl + add_bias;
        dst[hi] = s0 * wl + s1 * wh + add_bias;
    }
}
/* Convert len int32 samples to float, scaling each by mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = mul * src[n];
}
/* Convert one magic-biased float to a 16-bit sample value (in the low 16
 * bits of the returned int).  The caller is expected to have pre-scaled
 * the float so the sample sits in the low bits of its IEEE-754 bit
 * pattern (bias around 0x43c08000).
 *
 * Fixes vs the previous version:
 *  - the float bits are read through a union instead of
 *    *(const int32_t*)src, which violated strict aliasing (UB);
 *  - the working type is int32_t rather than int_fast32_t, since the
 *    shift-based clamp below assumes exactly 32-bit arithmetic. */
static av_always_inline int float_to_int16_one(const float *src){
    union { float f; int32_t i; } u;
    int32_t tmp;

    u.f = *src;
    tmp = u.i;
    if(tmp & 0xf0000){
        /* Out of the valid window: (0x43c0ffff - tmp) >> 31 yields 0 for
         * values below the window and -1 above it, so after the -0x8000
         * below the low 16 bits hold the saturated sample. */
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
/* Convert len magic-biased floats to int16 samples (see float_to_int16_one). */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = float_to_int16_one(&src[n]);
}
/* Convert per-channel float buffers to interleaved int16 output.
 * The stereo case is special-cased; other channel counts use a generic
 * strided store. */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int n, ch;

    if (channels == 2) {
        /* common stereo fast path */
        for (n = 0; n < len; n++) {
            dst[2 * n]     = float_to_int16_one(src[0] + n);
            dst[2 * n + 1] = float_to_int16_one(src[1] + n);
        }
    } else {
        for (ch = 0; ch < channels; ch++) {
            int pos = ch;
            for (n = 0; n < len; n++, pos += channels)
                dst[pos] = float_to_int16_one(src[ch] + n);
        }
    }
}
/* Element-wise in-place addition: v1[i] += v2[i] for order elements. */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int n;

    for (n = 0; n < order; n++)
        v1[n] += v2[n];
}
/* Element-wise in-place subtraction: v1[i] -= v2[i] for order elements. */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int n;

    for (n = 0; n < order; n++)
        v1[n] -= v2[n];
}
/* Scalar product of two int16 vectors, with each partial product shifted
 * right by 'shift' BEFORE accumulation (matters for both overflow and
 * rounding behavior, so the per-term shift is preserved). */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int32_t total = 0;
    int n;

    for (n = 0; n < order; n++)
        total += (v1[n] * v2[n]) >> shift;

    return total;
}
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* One row of the WMV2 8x8 inverse DCT, computed in place on 8 coefficients. */
static void wmv2_idct_row(short * b)
{
    int odd1, odd2;
    int t0, t1, t2, t3, t4, t5, t6, t7;

    /* step 1: input butterflies weighted by the cosine constants */
    t1 = W1*b[1] + W7*b[7];
    t7 = W7*b[1] - W1*b[7];
    t5 = W5*b[5] + W3*b[3];
    t3 = W3*b[5] - W5*b[3];
    t2 = W2*b[2] + W6*b[6];
    t6 = W6*b[2] - W2*b[6];
    t0 = W0*b[0] + W0*b[4];
    t4 = W0*b[0] - W0*b[4];

    /* step 2: combine odd terms; 181/256 approximates 1/sqrt(2) */
    odd1 = (181*(t1 - t5 + t7 - t3) + 128) >> 8;
    odd2 = (181*(t1 - t5 - t7 + t3) + 128) >> 8;

    /* step 3: output butterflies with rounding */
    b[0] = (t0 + t2 + t1 + t5 + (1<<7)) >> 8;
    b[1] = (t4 + t6 + odd1   + (1<<7)) >> 8;
    b[2] = (t4 - t6 + odd2   + (1<<7)) >> 8;
    b[3] = (t0 - t2 + t7 + t3 + (1<<7)) >> 8;
    b[4] = (t0 - t2 - t7 - t3 + (1<<7)) >> 8;
    b[5] = (t4 - t6 - odd2   + (1<<7)) >> 8;
    b[6] = (t4 + t6 - odd1   + (1<<7)) >> 8;
    b[7] = (t0 + t2 - t1 - t5 + (1<<7)) >> 8;
}
static void wmv2_idct_col(short * b)
4046
{
4047
    int s1,s2;
4048
    int a0,a1,a2,a3,a4,a5,a6,a7;
4049
    /*step 1, with extended precision*/
4050
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4051
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4052
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4053
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4054
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4055
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4056
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4057
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4058
    /*step 2*/
4059
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4060
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4061
    /*step 3*/
4062
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4063
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4064
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4065
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4066

    
4067
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4068
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4069
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4070
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4071
}
4072
/* Full WMV2 8x8 inverse DCT on a 64-coefficient block: row pass first,
 * then column pass. */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 64; n += 8)
        wmv2_idct_row(block + n);
    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4083
 converted */
4084
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4085
{
4086
    ff_wmv2_idct_c(block);
4087
    put_pixels_clamped_c(block, dest, line_size);
4088
}
4089
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4090
{
4091
    ff_wmv2_idct_c(block);
4092
    add_pixels_clamped_c(block, dest, line_size);
4093
}
4094
/* Reference integer IDCT (j_rev_dct) then clamped store into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}

/* Reference integer IDCT (j_rev_dct) then clamped add onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT (lowres=1) then clamped store into dest. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}

/* 4x4 reduced-resolution IDCT (lowres=1) then clamped add onto dest. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT (lowres=2) then clamped store into dest. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}

/* 2x2 reduced-resolution IDCT (lowres=2) then clamped add onto dest. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
/* 1x1 "IDCT" (lowres=3): only the DC coefficient matters; descale it by 8
 * with rounding and clamp through the crop table. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4) >> 3];
}

/* 1x1 "IDCT" add variant: descaled DC added onto the destination pixel,
 * clamped through the crop table. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4) >> 3)];
}
/* No-op with the prefetch-function signature; installed when no
 * platform-specific prefetch implementation is available. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
}
/* init static data */
4143
void dsputil_static_init(void)
4144
{
4145
    int i;
4146

    
4147
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4148
    for(i=0;i<MAX_NEG_CROP;i++) {
4149
        ff_cropTbl[i] = 0;
4150
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4151
    }
4152

    
4153
    for(i=0;i<512;i++) {
4154
        ff_squareTbl[i] = (i - 256) * (i - 256);
4155
    }
4156

    
4157
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4158
}
4159

    
4160
int ff_check_alignment(void){
4161
    static int did_fail=0;
4162
    DECLARE_ALIGNED_16(int, aligned);
4163

    
4164
    if((long)&aligned & 15){
4165
        if(!did_fail){
4166
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4167
            av_log(NULL, AV_LOG_ERROR,
4168
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4169
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4170
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4171
                "Do not report crashes to FFmpeg developers.\n");
4172
#endif
4173
            did_fail=1;
4174
        }
4175
        return -1;
4176
    }
4177
    return 0;
4178
}
4179

    
4180
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4181
{
4182
    int i;
4183

    
4184
    ff_check_alignment();
4185

    
4186
#ifdef CONFIG_ENCODERS
4187
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4188
        c->fdct = fdct_ifast;
4189
        c->fdct248 = fdct_ifast248;
4190
    }
4191
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4192
        c->fdct = ff_faandct;
4193
        c->fdct248 = ff_faandct248;
4194
    }
4195
    else {
4196
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4197
        c->fdct248 = ff_fdct248_islow;
4198
    }
4199
#endif //CONFIG_ENCODERS
4200

    
4201
    if(avctx->lowres==1){
4202
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4203
            c->idct_put= ff_jref_idct4_put;
4204
            c->idct_add= ff_jref_idct4_add;
4205
        }else{
4206
            c->idct_put= ff_h264_lowres_idct_put_c;
4207
            c->idct_add= ff_h264_lowres_idct_add_c;
4208
        }
4209
        c->idct    = j_rev_dct4;
4210
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4211
    }else if(avctx->lowres==2){
4212
        c->idct_put= ff_jref_idct2_put;
4213
        c->idct_add= ff_jref_idct2_add;
4214
        c->idct    = j_rev_dct2;
4215
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4216
    }else if(avctx->lowres==3){
4217
        c->idct_put= ff_jref_idct1_put;
4218
        c->idct_add= ff_jref_idct1_add;
4219
        c->idct    = j_rev_dct1;
4220
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4221
    }else{
4222
        if(avctx->idct_algo==FF_IDCT_INT){
4223
            c->idct_put= ff_jref_idct_put;
4224
            c->idct_add= ff_jref_idct_add;
4225
            c->idct    = j_rev_dct;
4226
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4227
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4228
                avctx->idct_algo==FF_IDCT_VP3){
4229
            c->idct_put= ff_vp3_idct_put_c;
4230
            c->idct_add= ff_vp3_idct_add_c;
4231
            c->idct    = ff_vp3_idct_c;
4232
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4233
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4234
            c->idct_put= ff_wmv2_idct_put_c;
4235
            c->idct_add= ff_wmv2_idct_add_c;
4236
            c->idct    = ff_wmv2_idct_c;
4237
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4238
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4239
            c->idct_put= ff_faanidct_put;
4240
            c->idct_add= ff_faanidct_add;
4241
            c->idct    = ff_faanidct;
4242
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4243
        }else{ //accurate/default
4244
            c->idct_put= ff_simple_idct_put;
4245
            c->idct_add= ff_simple_idct_add;
4246
            c->idct    = ff_simple_idct;
4247
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4248
        }
4249
    }
4250

    
4251
    if