ffmpeg / libavcodec / dsputil.c @ 4052cbf1
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* lpc.c */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
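
/* With a 32-bit unsigned long, ~0UL/255 == 0x01010101, so pb_7f expands to
 * 0x7f7f7f7f and pb_80 to 0x80808080; with a 64-bit long the same byte
 * pattern widens to 8 bytes. These constants support SWAR-style
 * byte-parallel arithmetic on native machine words. */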

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
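
/* Illustrative sketch (hypothetical helper, for exposition only): undoing the
 * zigzag scan, i.e. scattering 64 coefficients read in bitstream order back
 * into raster order. */
#if 0
static void dezigzag_sketch(DCTELEM *raster, const DCTELEM *scan_order)
{
    int i;
    for (i = 0; i < 64; i++)
        raster[ff_zigzag_direct[i]] = scan_order[i];
}
#endif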

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
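
/* Illustrative sketch (hypothetical helper, for exposition only): constant
 * division via the reciprocal table, matching the invariant stated above. */
#if 0
static uint32_t div_by_reciprocal_sketch(uint32_t a, int b)
{
    /* exact for 0 <= a <= 16909558 and 2 <= b <= 256, e.g. a=100, b=3:
     * (100 * 1431655766) >> 32 == 33 */
    return (uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32);
}
#endif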

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
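
/* Illustrative sketch (hypothetical usage, for exposition only): with an
 * identity permutation st->permutated[] equals the scan itself, and
 * raster_end[i] is the largest raster index among the first i+1 scan
 * positions, e.g. raster_end[2] == 8 for the zigzag scan (0, 1, 8, ...). */
#if 0
static void scantable_sketch(void)
{
    ScanTable st;
    uint8_t identity[64];
    int i;
    for (i = 0; i < 64; i++)
        identity[i] = i;
    ff_init_scantable(identity, &st, ff_zigzag_direct);
}
#endif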

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
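
/* Illustrative sketch (hypothetical usage, for exposition only): emulating
 * edges for an 8x8 block whose top-left corner lies outside the frame, as a
 * motion compensation routine would before reading the block. */
#if 0
static void edge_emu_sketch(MpegEncContext *s, uint8_t *src, int linesize)
{
    int src_x = -3, src_y = -2; /* block starts above and left of the frame */
    ff_emulated_edge_mc(s->edge_emu_buffer, src + src_y*linesize + src_x,
                        linesize, 8, 8, src_x, src_y,
                        s->h_edge_pos, s->v_edge_pos);
    /* the 8x8 block is then read from s->edge_emu_buffer instead of src */
}
#endif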

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

    
1303
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1304
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1305
{
1306
    int y, vx, vy;
1307
    const int s= 1<<shift;
1308

    
1309
    width--;
1310
    height--;
1311

    
1312
    for(y=0; y<h; y++){
1313
        int x;
1314

    
1315
        vx= ox;
1316
        vy= oy;
1317
        for(x=0; x<8; x++){ //XXX FIXME optimize
1318
            int src_x, src_y, frac_x, frac_y, index;
1319

    
1320
            src_x= vx>>16;
1321
            src_y= vy>>16;
1322
            frac_x= src_x&(s-1);
1323
            frac_y= src_y&(s-1);
1324
            src_x>>=shift;
1325
            src_y>>=shift;
1326

    
1327
            if((unsigned)src_x < width){
1328
                if((unsigned)src_y < height){
1329
                    index= src_x + src_y*stride;
1330
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1331
                                           + src[index       +1]*   frac_x )*(s-frac_y)
1332
                                        + (  src[index+stride  ]*(s-frac_x)
1333
                                           + src[index+stride+1]*   frac_x )*   frac_y
1334
                                        + r)>>(shift*2);
1335
                }else{
1336
                    index= src_x + av_clip(src_y, 0, height)*stride;
1337
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1338
                                          + src[index       +1]*   frac_x )*s
1339
                                        + r)>>(shift*2);
1340
                }
1341
            }else{
1342
                if((unsigned)src_y < height){
1343
                    index= av_clip(src_x, 0, width) + src_y*stride;
1344
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1345
                                           + src[index+stride  ]*   frac_y )*s
1346
                                        + r)>>(shift*2);
1347
                }else{
1348
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1349
                    dst[y*stride + x]=    src[index         ];
1350
                }
1351
            }
1352

    
1353
            vx+= dxx;
1354
            vy+= dyx;
1355
        }
1356
        ox += dxy;
1357
        oy += dyy;
1358
    }
1359
}
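
/* Note on the incremental evaluation above: for destination pixel (x,y) the
 * accumulated motion vector is vx = ox + x*dxx + y*dxy and
 * vy = oy + x*dyx + y*dyy; the integer source position is vx>>(16+shift)
 * (likewise for vy) and frac_x/frac_y are the next 'shift' bits, used as
 * bilinear weights against s = 1<<shift. */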

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
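
/* Note on the constants: 683 == round(2048/3), so (683*(2*a + b + 1)) >> 11
 * approximates (2*a + b)/3 with rounding; likewise 2731 == round(32768/12)
 * implements the /12 weightings (weights summing to 12, +6 for rounding) in
 * the two-dimensional thirdpel cases below. */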
1380

    
1381
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1382
    int i,j;
1383
    for (i=0; i < height; i++) {
1384
      for (j=0; j < width; j++) {
1385
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1386
      }
1387
      src += stride;
1388
      dst += stride;
1389
    }
1390
}
1391

    
1392
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1393
    int i,j;
1394
    for (i=0; i < height; i++) {
1395
      for (j=0; j < width; j++) {
1396
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1397
      }
1398
      src += stride;
1399
      dst += stride;
1400
    }
1401
}
1402

    
1403
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1404
    int i,j;
1405
    for (i=0; i < height; i++) {
1406
      for (j=0; j < width; j++) {
1407
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1408
      }
1409
      src += stride;
1410
      dst += stride;
1411
    }
1412
}
1413

    
1414
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1415
    int i,j;
1416
    for (i=0; i < height; i++) {
1417
      for (j=0; j < width; j++) {
1418
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1419
      }
1420
      src += stride;
1421
      dst += stride;
1422
    }
1423
}
1424

    
1425
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1426
    int i,j;
1427
    for (i=0; i < height; i++) {
1428
      for (j=0; j < width; j++) {
1429
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1430
      }
1431
      src += stride;
1432
      dst += stride;
1433
    }
1434
}
1435

    
1436
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1437
    int i,j;
1438
    for (i=0; i < height; i++) {
1439
      for (j=0; j < width; j++) {
1440
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1441
      }
1442
      src += stride;
1443
      dst += stride;
1444
    }
1445
}
1446

    
1447
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1448
    int i,j;
1449
    for (i=0; i < height; i++) {
1450
      for (j=0; j < width; j++) {
1451
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1452
      }
1453
      src += stride;
1454
      dst += stride;
1455
    }
1456
}
1457

    
1458
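/* Illustrative sketch, not part of the original dsputil.c: the third-pel
 * interpolators above replace division by 3 (or by 12 in the 2-D cases,
 * where the taps sum to 12) with fixed-point multiplies: 683 ~= 2048/3, so
 * (683*v) >> 11 ~= v/3, and 2731 ~= 32768/12 plays the same role for the
 * >> 15 forms. A standalone check of the one-dimensional form used by
 * put_tpel_pixels_mc10_c (the helper name is hypothetical): */
static av_unused int tpel_third_div_example(int a, int b)
{
    int exact = (2*a + b + 1) / 3;          /* rounded (2*a + b)/3 */
    int fast  = (683*(2*a + b + 1)) >> 11;  /* same fixed-point form as above */
    return fast == exact;                   /* holds for 8-bit sample values */
}
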
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

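/* Illustrative sketch, not part of the original file: the bilinear chroma
 * weights above satisfy A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64,
 * so op_put's "+ 32 >> 6" is a rounded division by the weight sum. One output
 * sample in standalone form (the helper name is hypothetical): */
static av_unused uint8_t chroma_bilinear_sample_example(const uint8_t *src, int stride, int x, int y)
{
    const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
    /* same arithmetic op_put applies to dst[0] in OPNAME ## h264_chroma_mc*_c */
    return (A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + 32) >> 6;
}
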
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

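/* Illustrative note, not part of the original file: the VC-1 variants above
 * use the bias 32 - 4 = 28 rather than the full rounding bias 32, i.e. they
 * round slightly downward ("no rounding", as selected by VC-1's rounding
 * control). The effect, in a hypothetical standalone form: */
static av_unused int vc1_no_rnd_bias_example(int weighted_sum)
{
    int rounded = (weighted_sum + 32) >> 6;      /* conventional rounding */
    int no_rnd  = (weighted_sum + 32 - 4) >> 6;  /* VC-1 no-rounding bias */
    return rounded - no_rnd;                     /* always 0 or 1 here */
}
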
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

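/* Illustrative sketch, not part of the original file: QPEL_MC above builds the
 * MPEG-4 half-pel filter from the 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1)
 * (taps sum to 32, hence op_put's "+ 16 >> 5"), mirroring samples at the block
 * edges instead of reading past them. An interior tap in standalone form,
 * assuming src points into a row with at least 9 valid samples (the helper
 * name is hypothetical): */
static av_unused uint8_t mpeg4_qpel_tap_example(const uint8_t *src)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clips filter overshoot to 0..255 */
    int v = (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]);
    return cm[(v + 16) >> 5];
}
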
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

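/* Illustrative sketch, not part of the original file: H264_LOWPASS above is
 * the H.264 6-tap half-pel kernel (1, -5, 20, 20, -5, 1), whose taps sum to
 * 32; one-dimensional results are normalized with (v + 16) >> 5, while the
 * hv (center) position filters the unshifted 16-bit intermediates a second
 * time and therefore normalizes with (v + 512) >> 10 (see op2_put below).
 * One 1-D tap, assuming src[-2..3] are valid samples (the helper name is
 * hypothetical): */
static av_unused uint8_t h264_sixtap_example(const uint8_t *src)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clips to the 8-bit range */
    int v = (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);
    return cm[(v + 16) >> 5];
}
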
#define H264_MC(OPNAME, SIZE) \
2496
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2497
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2498
}\
2499
\
2500
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2501
    uint8_t half[SIZE*SIZE];\
2502
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2503
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2504
}\
2505
\
2506
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2507
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2508
}\
2509
\
2510
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2511
    uint8_t half[SIZE*SIZE];\
2512
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2513
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2514
}\
2515
\
2516
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2517
    uint8_t full[SIZE*(SIZE+5)];\
2518
    uint8_t * const full_mid= full + SIZE*2;\
2519
    uint8_t half[SIZE*SIZE];\
2520
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2521
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2522
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2523
}\
2524
\
2525
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

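/* Naming sketch: the mcXY suffix encodes the sub-pel position, X being the
 * horizontal and Y the vertical offset in quarter-pel units, so mc20 is the
 * horizontal half-pel sample, mc02 the vertical one and mc22 the centre.
 * As visible above, quarter-pel positions are formed by averaging two
 * lowpass-filtered planes with OPNAME ## pixels ## SIZE ## _l2(). */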
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
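/* The 6-tap kernel (1,-5,20,20,-5,1) sums to 32, so a single filter pass is
 * renormalized with ((b) + 16) >> 5; the hv path applies it in both
 * dimensions for a gain of 32*32 = 1024, hence ((b) + 512) >> 10. The cm[]
 * crop table then clips the rounded result to 0..255. */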

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
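/* Explicit weighted prediction: op_scale1 computes
 * (pel*weight + offset) >> log2_denom with round-to-nearest, op_scale2 the
 * bi-predictive (src*weights + dst*weightd + offset) >> (log2_denom+1);
 * forcing the bi-pred offset odd via ((offset + 1) | 1) folds the rounding
 * term into the offset itself. */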

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
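/* The WMV2 half-pel interpolation above is the 4-tap kernel
 * (-1, 9, 9, -1)/16: dst = (9*(b + c) - (a + d) + 8) >> 4, with the result
 * clipped to 0..255 through the cm[] crop table. */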

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
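/* The d1 cascade implements the H.263 deblocking nonlinearity: the
 * correction follows d for small values, ramps back down to zero by
 * 2*strength and vanishes beyond it, so genuine edges stay sharp. The
 * (p&256) tests are a branchless clamp of p1/p2: negative values map to 0
 * and overflows store as 255 in the uint8_t plane. */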

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
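/* H.261 loop filtering is a separable [1 2 1] kernel: the first passes
 * accumulate the vertical taps into temp[] (border rows are scaled by 4
 * instead, leaving them effectively unfiltered), the final pass adds the
 * horizontal taps and renormalizes by 1/16 with rounding. */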

static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
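/* tc0[] carries one clipping threshold per group of 4 pels, derived from
 * the boundary strength (negative = leave the group untouched); tc is
 * widened by one for each extra pel that gets filtered, which loosens the
 * clip on i_delta. The v/h wrappers reuse the same kernel by swapping
 * xstride and ystride. */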

static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
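/* The pix_abs{16,8}{,_x2,_y2,_xy2} family returns the SAD against the
 * reference at integer, half-horizontal, half-vertical and half-diagonal
 * pel positions; the half-pel variants average the neighbouring reference
 * samples on the fly (avg2/avg4) instead of interpolating into a temporary
 * block first. */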

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
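/* NSSE = SSE plus a penalty on how much the local 2x2 gradient "noisiness"
 * of the reconstruction deviates from that of the source, scaled by
 * avctx->nsse_weight (8 when no context is available), so it favours
 * reconstructions that preserve the source's texture energy. */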

static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
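/* Both helpers rescale basis[i]*scale from BASIS_SHIFT to RECON_SHIFT fixed
 * point with round-to-nearest; try_8x8basis() returns the weighted squared
 * error that adding the scaled basis function to the residual would give,
 * letting the encoder test a change before committing it with
 * add_8x8basis(). */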

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
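/* A sketch of the semantics, with a hypothetical permutation: if
 * permutation[j] == j ^ 8 and the only nonzero coefficient is block[1], the
 * call moves it to block[9]. Only positions reached via the first last+1
 * scantable entries are visited; everything later in scan order is assumed
 * to already be zero. */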

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
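/* Typical use, sketched from the motion estimation code: a call along the
 * lines of
 *     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
 * fills a whole me_cmp_func table from one user option, with index 0
 * conventionally covering 16x16 blocks and index 1 covering 8x8 blocks. */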

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
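/* SWAR byte-wise addition on a whole machine word: masking with pb_7f keeps
 * each byte's low-7-bit sum from carrying into its neighbour, and the top
 * bits are added back modulo 2 as (a^b)&pb_80. Per byte this matches
 * wraparound uint8_t arithmetic, e.g. 0x90 + 0x92 -> 0x22. */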

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
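/* The subtraction twin of the trick above: (a|pb_80) forces bit 7 of every
 * byte on, so each byte's borrow is absorbed locally instead of propagating
 * into the next byte, and the XOR term restores the correct top bits
 * afterwards. */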

static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
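/* HuffYUV median prediction: each pel is predicted as the median of left,
 * top (src1[i]) and left + top - topleft, i.e. the gradient predictor
 * clamped between its two inputs. This routine reconstructs pixels from
 * stored residuals; sub_hfyu_median_prediction_c() below is the matching
 * encoder side. */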

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#else
#define B 0
#define G 1
#define R 2
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue){
    int i;
    int r,g,b;
    r= *red;
    g= *green;
    b= *blue;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
    }

    *red= r;
    *green= g;
    *blue= b;
}
#undef B
#undef G
#undef R

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
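/* SATD: the residual src-dst goes through an 8x8 Hadamard transform, three
 * butterfly stages per dimension, and the sum of absolute transform
 * coefficients is returned; BUTTERFLYA fuses the final vertical butterfly
 * with the absolute-value accumulation. */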

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
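/* DCT8_1D is an integer 8x8 DCT approximation in the style of H.264's 8x8
 * transform (the CONFIG_GPL guard reflects its origin); redefining SRC/DST
 * switches the one macro between the in-place row pass and the column pass
 * that only accumulates absolute values. */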

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
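/* rd8x8_c scores a block by an actual rate-distortion cost: it quantizes
 * the residual, counts the VLC bits needed for the coefficients,
 * dequantizes, runs the IDCT and takes the SSE against the source,
 * returning distortion + lambda*bits with lambda ~= qscale*qscale*109/128. */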

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
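/* The VSAD/VSSE metrics measure vertical activity: the intra variants sum
 * absolute (or squared) differences between adjacent rows of one block, the
 * inter variants do the same on the residual rows; among other things they
 * serve as cheap ildct_cmp candidates for interlaced-DCT decisions. */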

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
4100

    
4101
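/* Build the 16x16 comparison functions from the 8x8 ones: WRAPPER8_16_SQ
   (defined earlier in this file) applies the 8x8 function to each of the
   four 8x8 quadrants and sums the scores. */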
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

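/** In-place element-wise multiply: dst[i] *= src[i]. */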
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

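/** Multiply by a reversed vector: dst[i] = src0[i] * src1[len-1-i]. */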
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

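/** Element-wise multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */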
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

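/**
 * Window and overlap-add two half-blocks with a symmetric 2*len window.
 * In terms of the un-offset pointers, for 0 <= k < len this computes
 *   dst[k]         = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]         + add_bias
 *   dst[2*len-1-k] = src0[k]*win[k]         + src1[len-1-k]*win[2*len-1-k] + add_bias
 * i.e. the windowing + overlap-add step of the MDCT-based audio decoders,
 * with src1 read in reverse and add_bias added to every output sample.
 */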
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

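/** Scale a vector by a scalar: dst[i] = src[i] * mul. */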
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

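/* The *_sv_* variants below additionally multiply each consecutive pair
   (or quadruple) of outputs by a per-group scale vector from sv[]; the
   sv_fmul_* variants do the same without a src operand, i.e. the scale
   vectors themselves are the source. */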
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

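/**
 * In-place sum/difference butterfly:
 * (v1[i], v2[i]) -> (v1[i] + v2[i], v1[i] - v2[i]),
 * as used e.g. for mid/side stereo reconstruction.
 */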
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

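/** Dot product: returns sum(v1[i] * v2[i]) over len elements. */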
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

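/** Convert int32 samples to float and scale: dst[i] = src[i] * mul. */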
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

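/**
 * Clip one float value, operating on raw IEEE-754 bit patterns: a, mini,
 * maxi and maxisign are the bit patterns of the sample, the (negative)
 * minimum, the maximum, and the maximum with the sign bit flipped.
 * For negative floats a larger unsigned bit pattern means a more negative
 * value, so a > mini catches samples below the minimum; flipping the sign
 * bit makes positive floats compare above maxisign exactly when they
 * exceed the maximum.  This relies on min < 0 <= max holding.
 */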
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;