/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
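
// ~0UL/255 evaluates to 0x0101...01 sized to the native word, so multiplying
// it by 0x7f or 0x80 broadcasts that byte into every byte lane of the word.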

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
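
/* Fill a ScanTable: st->permutated[] is the scan order remapped through the
   IDCT's input permutation, and st->raster_end[i] is the highest permutated
   index seen up to scan position i, i.e. the end of the coded region in
   raster order. */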
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
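
/* Sum of all 256 samples of a 16x16 block. */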
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
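
/* Sum of squares of all samples of a 16x16 block; ff_squareTbl is biased by
   256 so the same table also serves the signed differences in sse*_c below. */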
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
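
/* Byte-swap w 32-bit words, unrolled by 8 with a scalar tail loop. */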
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
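
/* sse{4,8,16}_c: sum of squared errors between two blocks of width 4/8/16
   over h lines; pix1[i] - pix2[i] lies in -255..255, which the +256 bias of
   the sq pointer maps into ff_squareTbl. */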
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
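
/* Wavelet-domain distortion metric for the Snow encoder: transform the pixel
   differences with ff_spatial_dwt(), then sum the absolute subband
   coefficients, each weighted by the per-level/orientation scale[] entry. */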

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
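
/* get_pixels/diff_pixels: load an 8x8 block of samples (or the difference of
   two blocks) into a DCTELEM array in raster order, e.g. as FDCT input. */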
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}
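
/* Like put_pixels_clamped_c, but the coefficients are centred around zero:
   add the +128 level shift and clamp to 0..255. */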
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
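
/* PIXOP2() instantiates the put/avg pixel primitives.  The word-parallel
   averages rely on a+b == (a^b) + 2*(a&b): (a|b) - (((a^b)&0xFE..FE)>>1) is
   the per-byte rounded average and (a&b) + (((a^b)&0xFE..FE)>>1) the
   truncated one; masking with 0xFE before the shift keeps the low bit of one
   byte from leaking into its neighbour. */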
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

    
1277
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1278
{
1279
    const int A=(16-x16)*(16-y16);
1280
    const int B=(   x16)*(16-y16);
1281
    const int C=(16-x16)*(   y16);
1282
    const int D=(   x16)*(   y16);
1283
    int i;
1284

    
1285
    for(i=0; i<h; i++)
1286
    {
1287
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1288
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1289
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1290
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1291
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1292
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1293
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1294
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1295
        dst+= stride;
1296
        src+= stride;
1297
    }
1298
}
1299

    
1300
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1301
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1302
{
1303
    int y, vx, vy;
1304
    const int s= 1<<shift;
1305

    
1306
    width--;
1307
    height--;
1308

    
1309
    for(y=0; y<h; y++){
1310
        int x;
1311

    
1312
        vx= ox;
1313
        vy= oy;
1314
        for(x=0; x<8; x++){ //XXX FIXME optimize
1315
            int src_x, src_y, frac_x, frac_y, index;
1316

    
1317
            src_x= vx>>16;
1318
            src_y= vy>>16;
1319
            frac_x= src_x&(s-1);
1320
            frac_y= src_y&(s-1);
1321
            src_x>>=shift;
1322
            src_y>>=shift;
1323

    
1324
            if((unsigned)src_x < width){
1325
                if((unsigned)src_y < height){
1326
                    index= src_x + src_y*stride;
1327
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1328
                                           + src[index       +1]*   frac_x )*(s-frac_y)
1329
                                        + (  src[index+stride  ]*(s-frac_x)
1330
                                           + src[index+stride+1]*   frac_x )*   frac_y
1331
                                        + r)>>(shift*2);
1332
                }else{
1333
                    index= src_x + av_clip(src_y, 0, height)*stride;
1334
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1335
                                          + src[index       +1]*   frac_x )*s
1336
                                        + r)>>(shift*2);
1337
                }
1338
            }else{
1339
                if((unsigned)src_y < height){
1340
                    index= av_clip(src_x, 0, width) + src_y*stride;
1341
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1342
                                           + src[index+stride  ]*   frac_y )*s
1343
                                        + r)>>(shift*2);
1344
                }else{
1345
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1346
                    dst[y*stride + x]=    src[index         ];
1347
                }
1348
            }
1349

    
1350
            vx+= dxx;
1351
            vy+= dyx;
1352
        }
1353
        ox += dxy;
1354
        oy += dyy;
1355
    }
1356
}
1357

    
1358
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1359
    switch(width){
1360
    case 2: put_pixels2_c (dst, src, stride, height); break;
1361
    case 4: put_pixels4_c (dst, src, stride, height); break;
1362
    case 8: put_pixels8_c (dst, src, stride, height); break;
1363
    case 16:put_pixels16_c(dst, src, stride, height); break;
1364
    }
1365
}
1366

    
1367
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1368
    int i,j;
1369
    for (i=0; i < height; i++) {
1370
      for (j=0; j < width; j++) {
1371
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1372
      }
1373
      src += stride;
1374
      dst += stride;
1375
    }
1376
}
1377

    
1378
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1379
    int i,j;
1380
    for (i=0; i < height; i++) {
1381
      for (j=0; j < width; j++) {
1382
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1383
      }
1384
      src += stride;
1385
      dst += stride;
1386
    }
1387
}
1388

    
1389
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1390
    int i,j;
1391
    for (i=0; i < height; i++) {
1392
      for (j=0; j < width; j++) {
1393
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1394
      }
1395
      src += stride;
1396
      dst += stride;
1397
    }
1398
}
1399

    
1400
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1401
    int i,j;
1402
    for (i=0; i < height; i++) {
1403
      for (j=0; j < width; j++) {
1404
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1405
      }
1406
      src += stride;
1407
      dst += stride;
1408
    }
1409
}
1410

    
1411
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1412
    int i,j;
1413
    for (i=0; i < height; i++) {
1414
      for (j=0; j < width; j++) {
1415
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1416
      }
1417
      src += stride;
1418
      dst += stride;
1419
    }
1420
}
1421

    
1422
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1423
    int i,j;
1424
    for (i=0; i < height; i++) {
1425
      for (j=0; j < width; j++) {
1426
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1427
      }
1428
      src += stride;
1429
      dst += stride;
1430
    }
1431
}
1432

    
1433
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1434
    int i,j;
1435
    for (i=0; i < height; i++) {
1436
      for (j=0; j < width; j++) {
1437
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1438
      }
1439
      src += stride;
1440
      dst += stride;
1441
    }
1442
}
1443

    
1444
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1445
    int i,j;
1446
    for (i=0; i < height; i++) {
1447
      for (j=0; j < width; j++) {
1448
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1449
      }
1450
      src += stride;
1451
      dst += stride;
1452
    }
1453
}
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
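
/* H.264 chroma MC: bilinear interpolation at eighth-pel accuracy.
 * x and y are the fractional offsets in 1/8 units (0..7); the four
 * weights A..D always sum to 64 (e.g. x=2, y=3 gives A=30, B=10, C=18,
 * D=6).  When D is zero the offset is purely horizontal or purely
 * vertical, so the else branch falls back to a cheaper 1-D filter with
 * weight E=B+C applied at distance 'step' (1 for horizontal, stride
 * for vertical). */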
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
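
/* Normalisation of the 6-bit weighted sums above: op_put rounds with
 * +32 before the >>6, op_avg additionally averages the result with the
 * pixel already in dst. */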
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
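
/* VC-1 chroma MC: the same bilinear filter as the H.264 version above,
 * but with a rounding constant of 28 (32 - 4) instead of 32, which is
 * presumably what VC-1's "no rounding" control expects. */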
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
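
/* MPEG-4 quarter-pel MC.  The *_lowpass filters are the 8-tap half-pel
 * filter (-1, 3, -6, 20, 20, -6, 3, -1)/32 of the MPEG-4 spec; samples
 * beyond the block edge are mirrored rather than read (note how
 * src[-1] becomes src[0] in the dst[0] row).  The qpelN_mcXY functions
 * then derive all 16 quarter-pel positions: half-pel planes come
 * straight from the filters, and quarter-pel planes average a filtered
 * plane with the nearest integer/half-pel plane (pixels*_l2).  The
 * non-static ff_*_old_c variants keep an older 4-way average
 * (pixels*_l4) of the same positions. */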
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
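
/* Rounding variants used to instantiate QPEL_MC: the _no_rnd_ ops add
 * 15 instead of 16 before the >>5, i.e. they round downwards,
 * apparently matching MPEG-4's rounding-control bit; cm (ff_cropTbl)
 * clips the result to 0..255. */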
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
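
/* H.264 luma lowpass filters: the 6-tap half-pel kernel
 * (1, -5, 20, 20, -5, 1) of the spec, normalised by (x+16)>>5 after a
 * single pass.  The *_hv_lowpass versions run the horizontal filter
 * into a 16-bit tmp buffer first and then filter tmp vertically, so
 * the combined result is normalised by (x+512)>>10 via OP2.  The
 * 2-wide helpers are marked av_unused. */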
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
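
/* H264_MC expands the 16 quarter-pel positions mcXY for one block
 * size: X,Y are quarter-pel offsets (0..3), mc00 is a plain copy,
 * mc20/mc02 are the pure half-pel filters, mc22 the 2-D filter, and
 * the remaining positions average two of those planes via pixels*_l2.
 * full/full_mid stage a (SIZE+5)-row copy of the source so the
 * vertical filter can read two rows above and three below the block. */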
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2628

    
2629
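/* The _mcXY functions above interpolate at quarter-pel offset (X/4, Y/4).
 * Rounding in the op_* macros below: one pass of the 6-tap filter
 * (1,-5,20,20,-5,1) scales by 32, hence (b + 16) >> 5; the separable
 * H+V pass scales by 32*32 = 1024, hence (b + 512) >> 10 in op2_*. */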
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

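/* H.264 weighted prediction: op_scale1 applies one weight/offset pair to a
 * block in place, op_scale2 blends two predictions with separate weights.
 * In biweight_*, ((offset + 1) | 1) makes the folded offset an odd multiple
 * of 1<<log2_denom, so it always contains the rounding term needed by the
 * final >> (log2_denom + 1). */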
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

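/* WMV2 half-pel interpolation: a 4-tap (-1, 9, 9, -1)/16 filter, applied
 * horizontally here and vertically in wmv2_mspel8_v_lowpass() below. */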
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

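/* H.263 (Annex J) deblocking. The d1 ramp filters small differences at
 * full strength and fades to zero across real edges. "if(p&256) p= ~(p>>31)"
 * is a branchless clip: for results in [-256, 511] bit 8 flags out-of-range
 * values, and ~(p>>31) yields 0 for negative p and -1 (255 after the 8-bit
 * store) for positive overflow. */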
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

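/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of the 8x8 block
 * interior; the border rows and columns are passed through unfiltered. */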
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

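/* H.264 deblocking: an edge is filtered only while the |p0-q0|, |p1-p0| and
 * |q1-q0| gradients stay below the alpha/beta thresholds; the correction is
 * clipped to +-tc, where tc grows by one for each side whose second pixel
 * (p1/q1) is also filtered. */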
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

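/* Motion-estimation SAD. pix_abs*_c compare against the reference directly;
 * the _x2/_y2/_xy2 variants compare against the rounded 2- or 4-pixel
 * average, i.e. the half-pel interpolated reference. */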
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

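/* Noise-preserving SSE: plain SSE plus a penalty for how much the two
 * blocks differ in local gradient structure, weighted by
 * avctx->nsse_weight (8 when no context is available). */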
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

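/* try_8x8basis_c() estimates the weighted squared error remaining if
 * 'scale' times a basis function were added to the residual;
 * add_8x8basis_c() applies that update. Presumably used by the encoder's
 * rate-distortion quantization refinement. */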
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

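/* Byte-parallel (SWAR) arithmetic: the low 7 bits of each byte are added
 * without inter-byte carries, then the MSB of every byte is patched up
 * with an XOR. pb_7f/pb_80 replicate 0x7f/0x80 across a native word. */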
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

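/* HuffYUV median prediction: each pixel is predicted by the median of
 * left, top and left + top - topleft, computed with mid_pred(). */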
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

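/* SATD: hadamard8_diff8x8_c() runs an 8x8 Hadamard transform over the
 * difference block using the butterfly macros below and sums the absolute
 * coefficients; the _intra variant transforms the source block itself and
 * subtracts the DC term. */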
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

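/* DCT-based metrics: dct_sad transforms the difference block with the
 * configured fdct and sums the absolute coefficients, dct_max takes the
 * largest one, and quant_psnr measures the error introduced by quantizing
 * and reconstructing the transformed difference. */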
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

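/* rd8x8_c() is a real rate-distortion comparison: the difference block is
 * quantized, its VLC bit cost is counted, then it is dequantized and
 * inverse transformed to measure the actual distortion. bit8x8_c() below
 * returns the bit cost alone. */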
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

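/* Vertical activity metrics: vsad/vsse sum the absolute respectively
 * squared line-to-line differences, either within one block (_intra) or
 * of the difference between two blocks. */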
#define VSAD_INTRA(size) \
3970
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3971
    int score=0;                                                                                            \
3972
    int x,y;                                                                                                \
3973
                                                                                                            \
3974
    for(y=1; y<h; y++){                                                                                     \
3975
        for(x=0; x<size; x+=4){                                                                             \
3976
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
3977
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
3978
        }                                                                                                   \
3979
        s+= stride;                                                                                         \
3980
    }                                                                                                       \
3981
                                                                                                            \
3982
    return score;                                                                                           \
3983
}
3984
VSAD_INTRA(8)
3985
VSAD_INTRA(16)
3986

    
3987
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3988
    int score=0;
3989
    int x,y;
3990

    
3991
    for(y=1; y<h; y++){
3992
        for(x=0; x<16; x++){
3993
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3994
        }
3995
        s1+= stride;
3996
        s2+= stride;
3997
    }
3998

    
3999
    return score;
4000
}
4001

    
4002
#define SQ(a) ((a)*(a))
4003
#define VSSE_INTRA(size) \
4004
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4005
    int score=0;                                                                                            \
4006
    int x,y;                                                                                                \
4007
                                                                                                            \
4008
    for(y=1; y<h; y++){                                                                                     \
4009
        for(x=0; x<size; x+=4){                                                                               \
4010
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
4011
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
4012
        }                                                                                                   \
4013
        s+= stride;                                                                                         \
4014
    }                                                                                                       \
4015
                                                                                                            \
4016
    return score;                                                                                           \
4017
}
4018
VSSE_INTRA(8)
4019
VSSE_INTRA(16)
4020

    
4021
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4022
    int score=0;
4023
    int x,y;
4024

    
4025
    for(y=1; y<h; y++){
4026
        for(x=0; x<16; x++){
4027
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
4028
        }
4029
        s1+= stride;
4030
        s2+= stride;
4031
    }
4032

    
4033
    return score;
4034
}
4035

    
4036
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4037
                               int size){
4038
    int score=0;
4039
    int i;
4040
    for(i=0; i<size; i++)
4041
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
4042
    return score;
4043
}
4044

    
4045
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4046
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4047
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4048
#if CONFIG_GPL
4049
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4050
#endif
4051
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4052
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4053
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4054
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4055

    
4056
static void vector_fmul_c(float *dst, const float *src, int len){
4057
    int i;
4058
    for(i=0; i<len; i++)
4059
        dst[i] *= src[i];
4060
}
4061

    
4062
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4063
    int i;
4064
    src1 += len-1;
4065
    for(i=0; i<len; i++)
4066
        dst[i] = src0[i] * src1[-i];
4067
}
4068

    
4069
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}

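/* Windowed overlap of two blocks around the midpoint dst+len: for each
   mirrored index pair (i in [-len,0), j = -i-1) it writes
     dst[i] = s0*win[j] - s1*win[i] + add_bias
     dst[j] = s0*win[i] + s1*win[j] + add_bias
   with s0 = src0[i], s1 = src1[j]; add_bias is simply added to every
   output sample (0 when no bias is wanted). */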
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

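/* Clips a single float, handled as its raw IEEE-754 bit pattern, against a
   range with min < 0 < max using only integer compares: nonnegative floats
   order like their bit patterns and negative floats in reverse, so any a
   above mini (unsigned) is a float below min, and XORing the sign bit
   makes the remaining values comparable against maxisign for the upper
   bound. */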
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
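/* Dispatcher: the sign-magnitude trick above is only valid when the range
   straddles zero; other ranges take the plain av_clipf() path.  Both loops
   are unrolled by 8, so len is assumed to be a multiple of 8. */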
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

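/* float->int16 via the mantissa trick: the caller is assumed (this is the
   usual FFmpeg float-audio convention, not spelled out here) to have biased
   samples so that silence has the bit pattern 0x43c08000, i.e. 385.0f, and
   the low 16 mantissa bits hold sample+0x8000.  tmp & 0xf0000 flags (most)
   out-of-range inputs; (0x43c0ffff - tmp)>>31 is then 0 or -1, so the final
   subtraction gives -0x8000 or -0x8001, which the truncating store into an
   int16_t turns into -32768 or +32767 respectively. */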
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

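/* Interleaves per-channel buffers into one packed stream, with a dedicated
   stereo fast path.  E.g. channels = 3, len = 2 packs (with ci = src[i]):
     dst = { c0[0], c1[0], c2[0], c0[1], c1[1], c2[1] } */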
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}

static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ += *v2++;
}

static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}

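/* Dot product with a per-term arithmetic shift.  Illustrative example
   (values arbitrary): order = 2, shift = 1, v1 = {3, 5}, v2 = {4, 6}:
     res = (3*4 >> 1) + (5*6 >> 1) = 6 + 15 = 21
   Note the shift is applied to every product, not to the final sum. */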
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

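/* The Wk are the standard 8-point IDCT butterfly constants in 11-bit fixed
   point, Wk = round(2048*sqrt(2)*cos(k*pi/16)); e.g. W4 = 2048*sqrt(2)*
   cos(pi/4) = 2048 exactly.  The 181 used below approximates 256/sqrt(2),
   and the "+128)>>8" / "+(1<<7))>>8" steps are rounding rescales back to
   the working precision. */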
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;