Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 2480c390

History | View | Annotate | Download (173 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file libavcodec/dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "mathops.h"
36
#include "snow.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39

    
40
/* snow.c */
41
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42

    
43
/* vorbis.c */
44
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45

    
46
/* ac3dec.c */
47
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
48

    
49
/* lpc.c */
50
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
51

    
52
/* pngdec.c */
53
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
54

    
55
/* eaidct.c */
56
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
57

    
58
/* Clipping table: indexed as ff_cropTbl[MAX_NEG_CROP + x], it clamps x to
 * [0,255].  Zero-initialized here; filled elsewhere at init time
 * (NOTE(review): initialization code is outside this chunk — confirm). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table: indexed as (ff_squareTbl + 256)[x] for x in [-256,255],
 * yielding x*x.  Zero-initialized here; filled elsewhere at init time. */
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
64

    
65
/* Classic JPEG/MPEG zigzag scan order for an 8x8 block of coefficients. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
75

    
76
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
88

    
89
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (filled in elsewhere at init time; only declared/aligned here) */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
91

    
92
/* Alternate (horizontal-biased) coefficient scan order for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
102

    
103
/* Alternate (vertical-biased) coefficient scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
113

    
114
/* Reciprocal table for division by multiplication:
 * a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
151

    
152
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
163

    
164
/* Row permutation used when building the scantable permutation for the SSE2 IDCT. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
165

    
166
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    /* Initialize a ScanTable from a raw scan order and an IDCT input
     * permutation: permutated[] is the scan remapped through permutation[],
     * and raster_end[i] is the highest permuted index seen up to position i. */
    int idx, max_pos;

    st->scantable = src_scantable;

    for (idx = 0; idx < 64; idx++) {
        int pos = src_scantable[idx];
        st->permutated[idx] = permutation[pos];
#if ARCH_PPC
        st->inverse[pos] = idx;
#endif
    }

    max_pos = -1;
    for (idx = 0; idx < 64; idx++) {
        if (st->permutated[idx] > max_pos)
            max_pos = st->permutated[idx];
        st->raster_end[idx] = max_pos;
    }
}
189

    
190
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of the 256 samples of a 16x16 block.
     * pix: top-left sample; line_size: bytes between vertically adjacent samples. */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
211

    
212
static int pix_norm1_c(uint8_t * pix, int line_size)
213
{
214
    int s, i, j;
215
    uint32_t *sq = ff_squareTbl + 256;
216

    
217
    s = 0;
218
    for (i = 0; i < 16; i++) {
219
        for (j = 0; j < 16; j += 8) {
220
#if 0
221
            s += sq[pix[0]];
222
            s += sq[pix[1]];
223
            s += sq[pix[2]];
224
            s += sq[pix[3]];
225
            s += sq[pix[4]];
226
            s += sq[pix[5]];
227
            s += sq[pix[6]];
228
            s += sq[pix[7]];
229
#else
230
#if LONG_MAX > 2147483647
231
            register uint64_t x=*(uint64_t*)pix;
232
            s += sq[x&0xff];
233
            s += sq[(x>>8)&0xff];
234
            s += sq[(x>>16)&0xff];
235
            s += sq[(x>>24)&0xff];
236
            s += sq[(x>>32)&0xff];
237
            s += sq[(x>>40)&0xff];
238
            s += sq[(x>>48)&0xff];
239
            s += sq[(x>>56)&0xff];
240
#else
241
            register uint32_t x=*(uint32_t*)pix;
242
            s += sq[x&0xff];
243
            s += sq[(x>>8)&0xff];
244
            s += sq[(x>>16)&0xff];
245
            s += sq[(x>>24)&0xff];
246
            x=*(uint32_t*)(pix+4);
247
            s += sq[x&0xff];
248
            s += sq[(x>>8)&0xff];
249
            s += sq[(x>>16)&0xff];
250
            s += sq[(x>>24)&0xff];
251
#endif
252
#endif
253
            pix += 8;
254
        }
255
        pix += line_size - 16;
256
    }
257
    return s;
258
}
259

    
260
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst (may be the same buffer).
     * The bulk is processed eight words per iteration; the tail one by one. */
    int n = 0;

    while (n + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[n + k] = bswap_32(src[n + k]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
277

    
278
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
279
{
280
    int s, i;
281
    uint32_t *sq = ff_squareTbl + 256;
282

    
283
    s = 0;
284
    for (i = 0; i < h; i++) {
285
        s += sq[pix1[0] - pix2[0]];
286
        s += sq[pix1[1] - pix2[1]];
287
        s += sq[pix1[2] - pix2[2]];
288
        s += sq[pix1[3] - pix2[3]];
289
        pix1 += line_size;
290
        pix2 += line_size;
291
    }
292
    return s;
293
}
294

    
295
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
296
{
297
    int s, i;
298
    uint32_t *sq = ff_squareTbl + 256;
299

    
300
    s = 0;
301
    for (i = 0; i < h; i++) {
302
        s += sq[pix1[0] - pix2[0]];
303
        s += sq[pix1[1] - pix2[1]];
304
        s += sq[pix1[2] - pix2[2]];
305
        s += sq[pix1[3] - pix2[3]];
306
        s += sq[pix1[4] - pix2[4]];
307
        s += sq[pix1[5] - pix2[5]];
308
        s += sq[pix1[6] - pix2[6]];
309
        s += sq[pix1[7] - pix2[7]];
310
        pix1 += line_size;
311
        pix2 += line_size;
312
    }
313
    return s;
314
}
315

    
316
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
317
{
318
    int s, i;
319
    uint32_t *sq = ff_squareTbl + 256;
320

    
321
    s = 0;
322
    for (i = 0; i < h; i++) {
323
        s += sq[pix1[ 0] - pix2[ 0]];
324
        s += sq[pix1[ 1] - pix2[ 1]];
325
        s += sq[pix1[ 2] - pix2[ 2]];
326
        s += sq[pix1[ 3] - pix2[ 3]];
327
        s += sq[pix1[ 4] - pix2[ 4]];
328
        s += sq[pix1[ 5] - pix2[ 5]];
329
        s += sq[pix1[ 6] - pix2[ 6]];
330
        s += sq[pix1[ 7] - pix2[ 7]];
331
        s += sq[pix1[ 8] - pix2[ 8]];
332
        s += sq[pix1[ 9] - pix2[ 9]];
333
        s += sq[pix1[10] - pix2[10]];
334
        s += sq[pix1[11] - pix2[11]];
335
        s += sq[pix1[12] - pix2[12]];
336
        s += sq[pix1[13] - pix2[13]];
337
        s += sq[pix1[14] - pix2[14]];
338
        s += sq[pix1[15] - pix2[15]];
339

    
340
        pix1 += line_size;
341
        pix2 += line_size;
342
    }
343
    return s;
344
}
345

    
346

    
347
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain weighted distortion between two w x h blocks (w must equal h;
 * w is 8, 16 or 32).  The pix1-pix2 difference is wavelet-transformed with
 * ff_spatial_dwt() and each subband's absolute coefficients are summed with a
 * per-(level, orientation) weight from scale[].
 * type: wavelet selector passed to ff_spatial_dwt()
 *       (index 0/1 also picks the matching weight set below).
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* decomposition levels: 3 for 8x8, 4 for 16x16/32x32 */
    int tmp[32*32];                     /* difference/transform buffer, fixed stride 32 */
    int level, ori;
    /* weights indexed as scale[type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* build the scaled (<<4) difference signal into tmp[] */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    /* accumulate |coefficient| * weight over every subband */
    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* level 0 also visits orientation 0 (the LL band) */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);      /* subband side length */
            int sx= (ori&1) ? size : 0;          /* horizontal subband offset */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;     /* vertical subband offset */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;    /* undo the <<4 input scale and the weight scale */
}
416

    
417
/* Thin wrappers binding w_c() to the comparison-function signature:
 * wNN_S_c — NN selects the wavelet weight set (53 -> type 1, 97 -> type 0),
 * S is the block width passed to w_c(). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
441

    
442
/* draw the edges of width 'w' of an image of size width, height */
443
//FIXME check that this is ok for mpeg4 interlaced
444
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    /* Replicate the outermost samples of a width x height image into a border
     * of w extra samples on each side; lines are 'wrap' bytes apart. */
    uint8_t *bottom = buf + (height - 1) * wrap;
    uint8_t *row;
    int i;

    /* top and bottom borders: copy the first/last line outwards */
    for (i = 1; i <= w; i++) {
        memcpy(buf    - i * wrap, buf,    width);
        memcpy(bottom + i * wrap, bottom, width);
    }

    /* left and right borders: replicate the edge sample of each line */
    row = buf;
    for (i = 0; i < height; i++) {
        memset(row - w,     row[0],       w);
        memset(row + width, row[width-1], w);
        row += wrap;
    }

    /* the four corners */
    for (i = 1; i <= w; i++) {
        memset(buf    - i * wrap - w,     buf[0],          w); /* top left */
        memset(buf    - i * wrap + width, buf[width-1],    w); /* top right */
        memset(bottom + i * wrap - w,     bottom[0],       w); /* bottom left */
        memset(bottom + i * wrap + width, bottom[width-1], w); /* bottom right */
    }
}
470

    
471
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp the block position so at least one sample row/column of the
     * block overlaps the source; src is shifted so that the clamped
     * coordinates still address the same picture data. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* region of the block that actually lies inside the source picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first valid row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last valid row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* left/right come last so the corners pick up the already-replicated
     * top/bottom samples */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
541

    
542
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    /* Copy an 8x8 block of pixels into a 64-entry DCT coefficient buffer
     * (one row of 8 per iteration). */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
560

    
561
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* Store the per-sample difference s1 - s2 of two 8x8 blocks into a
     * 64-entry DCT coefficient buffer. */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
580

    
581

    
582
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
583
                                 int line_size)
584
{
585
    int i;
586
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
587

    
588
    /* read the pixels */
589
    for(i=0;i<8;i++) {
590
        pixels[0] = cm[block[0]];
591
        pixels[1] = cm[block[1]];
592
        pixels[2] = cm[block[2]];
593
        pixels[3] = cm[block[3]];
594
        pixels[4] = cm[block[4]];
595
        pixels[5] = cm[block[5]];
596
        pixels[6] = cm[block[6]];
597
        pixels[7] = cm[block[7]];
598

    
599
        pixels += line_size;
600
        block += 8;
601
    }
602
}
603

    
604
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
605
                                 int line_size)
606
{
607
    int i;
608
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
609

    
610
    /* read the pixels */
611
    for(i=0;i<4;i++) {
612
        pixels[0] = cm[block[0]];
613
        pixels[1] = cm[block[1]];
614
        pixels[2] = cm[block[2]];
615
        pixels[3] = cm[block[3]];
616

    
617
        pixels += line_size;
618
        block += 8;
619
    }
620
}
621

    
622
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
623
                                 int line_size)
624
{
625
    int i;
626
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
627

    
628
    /* read the pixels */
629
    for(i=0;i<2;i++) {
630
        pixels[0] = cm[block[0]];
631
        pixels[1] = cm[block[1]];
632

    
633
        pixels += line_size;
634
        block += 8;
635
    }
636
}
637

    
638
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    /* Write an 8x8 block of signed coefficients as pixels: each value is
     * offset by +128 and clamped to [0,255]. */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[row * 8 + col] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        pixels += line_size;
    }
}
658

    
659
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
660
                          int line_size)
661
{
662
    int i;
663
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
664

    
665
    /* read the pixels */
666
    for(i=0;i<8;i++) {
667
        pixels[0] = cm[pixels[0] + block[0]];
668
        pixels[1] = cm[pixels[1] + block[1]];
669
        pixels[2] = cm[pixels[2] + block[2]];
670
        pixels[3] = cm[pixels[3] + block[3]];
671
        pixels[4] = cm[pixels[4] + block[4]];
672
        pixels[5] = cm[pixels[5] + block[5]];
673
        pixels[6] = cm[pixels[6] + block[6]];
674
        pixels[7] = cm[pixels[7] + block[7]];
675
        pixels += line_size;
676
        block += 8;
677
    }
678
}
679

    
680
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
681
                          int line_size)
682
{
683
    int i;
684
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
685

    
686
    /* read the pixels */
687
    for(i=0;i<4;i++) {
688
        pixels[0] = cm[pixels[0] + block[0]];
689
        pixels[1] = cm[pixels[1] + block[1]];
690
        pixels[2] = cm[pixels[2] + block[2]];
691
        pixels[3] = cm[pixels[3] + block[3]];
692
        pixels += line_size;
693
        block += 8;
694
    }
695
}
696

    
697
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
698
                          int line_size)
699
{
700
    int i;
701
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
702

    
703
    /* read the pixels */
704
    for(i=0;i<2;i++) {
705
        pixels[0] = cm[pixels[0] + block[0]];
706
        pixels[1] = cm[pixels[1] + block[1]];
707
        pixels += line_size;
708
        block += 8;
709
    }
710
}
711

    
712
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    /* Add an 8x8 block of coefficients to pixels without clamping
     * (byte arithmetic wraps as in the original). */
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
728

    
729
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    /* Add a 4x4 block of coefficients to pixels without clamping.
     * Note: unlike the 8-wide variants, coefficient rows here are stored
     * contiguously 4 entries apart. */
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
741

    
742
static int sum_abs_dctelem_c(DCTELEM *block)
{
    /* Sum of absolute values of all 64 coefficients of a block. */
    const DCTELEM *p   = block;
    const DCTELEM *end = block + 64;
    int total = 0;

    while (p < end) {
        int v = *p++;            /* copy first: FFABS evaluates its argument twice */
        total += FFABS(v);
    }
    return total;
}
749

    
750
#if 0
751

752
#define PIXOP2(OPNAME, OP) \
753
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
754
{\
755
    int i;\
756
    for(i=0; i<h; i++){\
757
        OP(*((uint64_t*)block), AV_RN64(pixels));\
758
        pixels+=line_size;\
759
        block +=line_size;\
760
    }\
761
}\
762
\
763
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
764
{\
765
    int i;\
766
    for(i=0; i<h; i++){\
767
        const uint64_t a= AV_RN64(pixels  );\
768
        const uint64_t b= AV_RN64(pixels+1);\
769
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
770
        pixels+=line_size;\
771
        block +=line_size;\
772
    }\
773
}\
774
\
775
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
776
{\
777
    int i;\
778
    for(i=0; i<h; i++){\
779
        const uint64_t a= AV_RN64(pixels  );\
780
        const uint64_t b= AV_RN64(pixels+1);\
781
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
782
        pixels+=line_size;\
783
        block +=line_size;\
784
    }\
785
}\
786
\
787
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
788
{\
789
    int i;\
790
    for(i=0; i<h; i++){\
791
        const uint64_t a= AV_RN64(pixels          );\
792
        const uint64_t b= AV_RN64(pixels+line_size);\
793
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
794
        pixels+=line_size;\
795
        block +=line_size;\
796
    }\
797
}\
798
\
799
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
800
{\
801
    int i;\
802
    for(i=0; i<h; i++){\
803
        const uint64_t a= AV_RN64(pixels          );\
804
        const uint64_t b= AV_RN64(pixels+line_size);\
805
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
806
        pixels+=line_size;\
807
        block +=line_size;\
808
    }\
809
}\
810
\
811
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
812
{\
813
        int i;\
814
        const uint64_t a= AV_RN64(pixels  );\
815
        const uint64_t b= AV_RN64(pixels+1);\
816
        uint64_t l0=  (a&0x0303030303030303ULL)\
817
                    + (b&0x0303030303030303ULL)\
818
                    + 0x0202020202020202ULL;\
819
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
820
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
821
        uint64_t l1,h1;\
822
\
823
        pixels+=line_size;\
824
        for(i=0; i<h; i+=2){\
825
            uint64_t a= AV_RN64(pixels  );\
826
            uint64_t b= AV_RN64(pixels+1);\
827
            l1=  (a&0x0303030303030303ULL)\
828
               + (b&0x0303030303030303ULL);\
829
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
830
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
831
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
832
            pixels+=line_size;\
833
            block +=line_size;\
834
            a= AV_RN64(pixels  );\
835
            b= AV_RN64(pixels+1);\
836
            l0=  (a&0x0303030303030303ULL)\
837
               + (b&0x0303030303030303ULL)\
838
               + 0x0202020202020202ULL;\
839
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
840
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
841
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
842
            pixels+=line_size;\
843
            block +=line_size;\
844
        }\
845
}\
846
\
847
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
848
{\
849
        int i;\
850
        const uint64_t a= AV_RN64(pixels  );\
851
        const uint64_t b= AV_RN64(pixels+1);\
852
        uint64_t l0=  (a&0x0303030303030303ULL)\
853
                    + (b&0x0303030303030303ULL)\
854
                    + 0x0101010101010101ULL;\
855
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
856
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
857
        uint64_t l1,h1;\
858
\
859
        pixels+=line_size;\
860
        for(i=0; i<h; i+=2){\
861
            uint64_t a= AV_RN64(pixels  );\
862
            uint64_t b= AV_RN64(pixels+1);\
863
            l1=  (a&0x0303030303030303ULL)\
864
               + (b&0x0303030303030303ULL);\
865
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
866
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
867
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
868
            pixels+=line_size;\
869
            block +=line_size;\
870
            a= AV_RN64(pixels  );\
871
            b= AV_RN64(pixels+1);\
872
            l0=  (a&0x0303030303030303ULL)\
873
               + (b&0x0303030303030303ULL)\
874
               + 0x0101010101010101ULL;\
875
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
876
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
877
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
878
            pixels+=line_size;\
879
            block +=line_size;\
880
        }\
881
}\
882
\
883
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
884
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
885
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
886
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
887
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
888
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
889
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
890

891
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
892
#else // 64 bit variant
893

    
894
/* PIXOP2(OPNAME, OP) instantiates the whole family of pixel copy/average
 * primitives for one output operation OP (op_put = store, op_avg = average
 * with the destination):
 *   OPNAME ## _pixels{2,4,8,16}_c           full-pel copy
 *   OPNAME ## _pixels*_{x2,y2,xy2}_c        half-pel in x, y, or both
 *   OPNAME ## _[no_rnd_]pixels*_l2 / _l4    2- resp. 4-source averagers
 * This is the 32-bit variant: four bytes are processed per uint32_t using a
 * SWAR split of each byte into its low 2 bits (l0/l1) and high 6 bits
 * (h0/h1) so the bilinear average of four pixels is done in parallel.
 * "no_rnd" variants use a smaller rounding constant (round down). */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* Full-pel copy never rounds, so the no_rnd name is just an alias. */\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* Half-pel positions are the average of the block and its neighbour one
 * pixel to the right (x2) or one line below (y2). */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* 4-source average: (a+b+c+d+2)>>2 per byte via the low-2/high-6 SWAR split. */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Same as _pixels8_l4 but with rounding constant 1 instead of 2 ("no_rnd"). */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* xy2: average of the 2x2 neighbourhood; rows are carried over in
 * a0/b0 vs a1/b1 (resp. l0/h0 vs l1/h1) so each source line is read once. */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* 8-wide xy2: two passes of 4 columns; the j loop rewinds to the top of
 * the block and shifts 4 pixels right between passes. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* 16-wide entry points are two adjacent 8-wide calls. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

    
1260
/* 32-bit op for the averaging family: rounding per-byte average with dst. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* end of the 64-bit / 32-bit PIXOP2 variant selection */
/* op for the put family: plain store. */
#define op_put(a, b) a = b
1263

    
1264
/* Instantiate both families: avg_* (average into destination) and
 * put_* (plain store), then drop the ops so they can be redefined later. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1268

    
1269
/* Scalar rounding averages of two resp. four values.
 * Fix: the macro arguments are now fully parenthesized, so an argument
 * containing a lower-precedence operator (e.g. avg2(x << 1, y)) expands
 * correctly instead of silently misparsing.  All existing call sites
 * (simple lvalues and already-parenthesized avg2 results) are unaffected. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1271

    
1272
/* Adapter for put_no_rnd_pixels16_l2(): combine two 16-wide sources into
 * dst using the no-rounding average, with all three strides bound to the
 * same value. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1275

    
1276
/* Adapter for put_no_rnd_pixels8_l2(): same as the 16-wide version above
 * but for 8-wide blocks; all three strides bound to the same value. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1279

    
1280
/* 1/16-pel bilinear interpolation over an 8-pixel-wide, h-line block
 * (one-warp-point GMC).  The four weights are the standard bilinear
 * products of the 1/16 fractional offsets (x16, y16); the weighted sum
 * plus 'rounder' is scaled back by >>8 (weights sum to 256). */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]          + B * src[col + 1]
                      + C * src[stride + col] + D * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1302

    
1303
/* Global motion compensation for one 8-pixel-wide, h-line block: each
 * destination pixel's source position is the affine map
 * (ox,oy) + col*(dxx,dyx) + row*(dxy,dyy), held in 16.16 fixed point with
 * 'shift' fractional bits kept for bilinear blending (rounder r, result
 * scaled back by >>(2*shift)).  Positions outside the source image fall
 * back to edge-clamped one-dimensional or nearest-sample interpolation. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one = 1 << shift;
    int row;

    /* turn the dimensions into the largest valid coordinate, so the
     * unsigned-compare trick below also rejects negative positions */
    width--;
    height--;

    for (row = 0; row < h; row++) {
        int col;
        int cx = ox;   /* 16.16 position of the current sample */
        int cy = oy;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int sx = cx >> 16;
            int sy = cy >> 16;
            const int fx = sx & (one - 1);   /* fractional parts */
            const int fy = sy & (one - 1);
            int idx;
            sx >>= shift;
            sy >>= shift;

            if ((unsigned)sx < width) {
                if ((unsigned)sy < height) {
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    idx = sx + sy * stride;
                    dst[row*stride + col] = ((  src[idx         ] * (one - fx)
                                              + src[idx       +1] *        fx ) * (one - fy)
                                            + (  src[idx+stride  ] * (one - fx)
                                               + src[idx+stride+1] *        fx ) *        fy
                                            + r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp y, interpolate in x only */
                    idx = sx + av_clip(sy, 0, height) * stride;
                    dst[row*stride + col] = ((  src[idx  ] * (one - fx)
                                              + src[idx+1] *        fx ) * one
                                            + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)sy < height) {
                    /* horizontally outside: clamp x, interpolate in y only */
                    idx = av_clip(sx, 0, width) + sy * stride;
                    dst[row*stride + col] = ((  src[idx       ] * (one - fy)
                                              + src[idx+stride] *        fy ) * one
                                            + r) >> (shift * 2);
                } else {
                    /* both outside: nearest edge-clamped sample */
                    idx = av_clip(sx, 0, width) + av_clip(sy, 0, height) * stride;
                    dst[row*stride + col] = src[idx];
                }
            }

            cx += dxx;
            cy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1360

    
1361
/* Full-pel thirdpel position: dispatch to the plain copy routine that
 * matches the block width (2/4/8/16). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1369

    
1370
/* Thirdpel MC, horizontal offset 1/3: dst ~= round((2*a + b)/3), done in
 * fixed point with the factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (683*(2*src[o] + src[o+1] + 1)) >> 11;
        }
    }
}
1380

    
1381
/* Thirdpel MC, horizontal offset 2/3: dst ~= round((a + 2*b)/3),
 * fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (683*(src[o] + 2*src[o+1] + 1)) >> 11;
        }
    }
}
1391

    
1392
/* Thirdpel MC, vertical offset 1/3: dst ~= round((2*top + bottom)/3),
 * fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (683*(2*src[o] + src[o+stride] + 1)) >> 11;
        }
    }
}
1402

    
1403
/* Thirdpel MC at (1/3,1/3): blend of the 2x2 neighbourhood with weights
 * 4/3/3/2, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (2731*(4*src[o] + 3*src[o+1] + 3*src[o+stride] + 2*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1413

    
1414
/* Thirdpel MC at (1/3,2/3): 2x2 blend with weights 3/2/4/3,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (2731*(3*src[o] + 2*src[o+1] + 4*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1424

    
1425
/* Thirdpel MC, vertical offset 2/3: dst ~= round((top + 2*bottom)/3),
 * fixed-point factor 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (683*(src[o] + 2*src[o+stride] + 1)) >> 11;
        }
    }
}
1435

    
1436
/* Thirdpel MC at (2/3,1/3): 2x2 blend with weights 3/4/2/3,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (2731*(3*src[o] + 4*src[o+1] + 2*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1446

    
1447
/* Thirdpel MC at (2/3,2/3): 2x2 blend with weights 2/3/3/4,
 * scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            dst[o] = (2731*(2*src[o] + 3*src[o+1] + 3*src[o+stride] + 4*src[o+stride+1] + 6)) >> 15;
        }
    }
}
1457

    
1458
/* Full-pel thirdpel position, averaging variant: dispatch to the plain
 * averaging routine that matches the block width (2/4/8/16). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1466

    
1467
/* Thirdpel MC, horizontal offset 1/3, then rounding average with the
 * existing destination pixel (683/2048 ~= 1/3 in fixed point). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (683*(2*src[o] + src[o+1] + 1)) >> 11;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1477

    
1478
/* Thirdpel MC, horizontal offset 2/3, then rounding average with the
 * existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (683*(src[o] + 2*src[o+1] + 1)) >> 11;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1488

    
1489
/* Thirdpel MC, vertical offset 1/3, then rounding average with the
 * existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (683*(2*src[o] + src[o+stride] + 1)) >> 11;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1499

    
1500
/* Thirdpel MC at (1/3,1/3) (2x2 weights 4/3/3/2, 2731/32768 ~= 1/12),
 * then rounding average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (2731*(4*src[o] + 3*src[o+1] + 3*src[o+stride] + 2*src[o+stride+1] + 6)) >> 15;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1510

    
1511
/* Thirdpel MC at (1/3,2/3) (2x2 weights 3/2/4/3), then rounding average
 * with the existing destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (2731*(3*src[o] + 2*src[o+1] + 4*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1521

    
1522
/* Thirdpel MC, vertical offset 2/3, then rounding average with the
 * existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (683*(src[o] + 2*src[o+stride] + 1)) >> 11;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1532

    
1533
/* Thirdpel MC at (2/3,1/3) (2x2 weights 3/4/2/3), then rounding average
 * with the existing destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (2731*(3*src[o] + 4*src[o+1] + 2*src[o+stride] + 3*src[o+stride+1] + 6)) >> 15;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1543

    
1544
/* Thirdpel MC at (2/3,2/3) (2x2 weights 2/3/3/4), then rounding average
 * with the existing destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int o = y*stride + x;
            const int p = (2731*(2*src[o] + 3*src[o+1] + 3*src[o+stride] + 4*src[o+stride+1] + 6)) >> 15;
            dst[o] = (dst[o] + p + 1) >> 1;
        }
    }
}
1554
#if 0
1555
#define TPEL_WIDTH(width)\
1556
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1557
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1558
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1559
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1560
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1561
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1562
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1563
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1564
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1565
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1566
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1567
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1568
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1569
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1570
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1571
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1572
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1573
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1574
#endif
1575

    
1576
/* H264_CHROMA_MC(OPNAME, OP) instantiates the H.264 chroma motion
 * compensation routines for 2-, 4- and 8-pixel-wide blocks.  The bilinear
 * weights A/B/C/D are products of the 1/8-pel fractions (x, y); they sum
 * to 64, and OP scales the weighted sum back (see op_put/op_avg below the
 * macro).  When D==0 the position lies on a row or column, so the 2x2
 * blend degenerates to a two-tap filter along one axis (step selects the
 * direction) — the common fast path. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1676

    
1677
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1678
#define op_put(a, b) a = (((b) + 32)>>6)
1679

    
1680
H264_CHROMA_MC(put_       , op_put)
1681
H264_CHROMA_MC(avg_       , op_avg)
1682
#undef op_avg
1683
#undef op_put
1684

    
/**
 * VC-1 8xH chroma motion compensation, "no rounding" variant.
 * Bilinear interpolation at 1/8-pel precision: each output pixel is a
 * weighted sum of the 2x2 neighborhood, normalized by >>6.  The bias is
 * 32 - 4 = 28 instead of the usual 32, which is VC-1's reduced-rounding
 * mode (see SMPTE 421M rounding control).
 *
 * @param dst    destination block (align 8)
 * @param src    source pixels (align 1); rows of stride bytes,
 *               reads up to src[stride*(h-1) + stride + 8]
 * @param stride line size of both src and dst in bytes
 * @param h      number of output rows
 * @param x,y    1/8-pel fractional position, each in [0,7]
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    /* bilinear weights; A+B+C+D == 64, hence the >>6 normalization */
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
/**
 * VC-1 8xH chroma motion compensation, "no rounding" variant, with
 * averaging against the existing destination (bi-prediction path).
 * Same bilinear 1/8-pel filter as put_no_rnd_vc1_chroma_mc8_c (bias
 * 32 - 4 = 28), but each result is folded into dst via avg2().
 *
 * @param dst    destination block (align 8); read and written
 * @param src    source pixels (align 1); rows of stride bytes,
 *               reads up to src[stride*(h-1) + stride + 8]
 * @param stride line size of both src and dst in bytes
 * @param h      number of output rows
 * @param x,y    1/8-pel fractional position, each in [0,7]
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    /* bilinear weights; A+B+C+D == 64, hence the >>6 normalization */
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
/**
 * QPEL_MC(r, OPNAME, RND, OP) instantiates the MPEG-4 quarter-pel motion
 * compensation C functions for 8x8 and 16x16 blocks:
 *  - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: the 8-tap half-pel FIR filter
 *    (taps 20,-6,3,-1) in horizontal/vertical direction; samples beyond the
 *    block edge are mirrored (note the repeated src[8]/src[16] terms).
 *  - OPNAME##qpel{8,16}_mcXY_c: the 16 quarter-pel positions, built from
 *    the lowpass filters plus pixel averaging (pixels*_l2/_l4).
 * OP is the store macro (put/avg, with or without rounding); RND selects
 * the matching put##RND## helper names for intermediate stages.
 * The ff_*_old_c variants keep the historical (l4-based) interpolation.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2215

    
2216
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2217
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2218
#define op_put(a, b) a = cm[((b) + 16)>>5]
2219
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2220

    
2221
QPEL_MC(0, put_       , _       , op_put)
2222
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2223
QPEL_MC(0, avg_       , _       , op_avg)
2224
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2225
#undef op_avg
2226
#undef op_avg_no_rnd
2227
#undef op_put
2228
#undef op_put_no_rnd
2229

    
2230
#if 1
2231
/* H.264 6-tap (1,-5,20,20,-5,1) half-pel interpolation filters for block
 * sizes 2, 4, 8 and 16.  _h_ filters horizontally, _v_ vertically and _hv_
 * filters horizontally into a 16-bit tmp buffer, then vertically (OP2 crops
 * the doubly-filtered 10-bit-scaled result). */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H.264 quarter-pel motion compensation dispatch: one _mcXY_c function per
 * quarter-pel position (X,Y in 0..3), built from the half-pel lowpass
 * filters above plus pairwise averaging (pixels_l2) for the quarter
 * positions.  full[] holds a copy of the source with 2 extra rows above and
 * 3 below, as needed by the 6-tap vertical filter. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2633
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2634
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2635
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2636
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2637

    
2638
H264_LOWPASS(put_       , op_put, op2_put)
2639
H264_LOWPASS(avg_       , op_avg, op2_avg)
2640
H264_MC(put_, 2)
2641
H264_MC(put_, 4)
2642
H264_MC(put_, 8)
2643
H264_MC(put_, 16)
2644
H264_MC(avg_, 4)
2645
H264_MC(avg_, 8)
2646
H264_MC(avg_, 16)
2647

    
2648
#undef op_avg
2649
#undef op_put
2650
#undef op2_avg
2651
#undef op2_put
2652
#endif
2653

    
2654
/* H.264 weighted prediction.  weight_*: explicit single-reference weighting
 * in place; biweight_*: bidirectional weighting of src into dst.  The
 * "if(W==n) continue;" chains make one macro body serve all block widths. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
/* Instantiate weighted-prediction functions for every H.264 partition size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2725
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2726
    int i;
2727

    
2728
    for(i=0; i<h; i++){
2729
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2730
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2731
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2732
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2733
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2734
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2735
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2736
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2737
        dst+=dstStride;
2738
        src+=srcStride;
2739
    }
2740
}
2741

    
2742
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) CAVS motion compensation: plain pixel copy/average. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) VC-1 motion compensation; rnd is unused for this case. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel position: implemented as a plain xy2 (diagonal
 * half-pel) copy/average, matching the codec's special-cased filter. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2801
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2802
    int i;
2803

    
2804
    for(i=0; i<w; i++){
2805
        const int src_1= src[ -srcStride];
2806
        const int src0 = src[0          ];
2807
        const int src1 = src[  srcStride];
2808
        const int src2 = src[2*srcStride];
2809
        const int src3 = src[3*srcStride];
2810
        const int src4 = src[4*srcStride];
2811
        const int src5 = src[5*srcStride];
2812
        const int src6 = src[6*srcStride];
2813
        const int src7 = src[7*srcStride];
2814
        const int src8 = src[8*srcStride];
2815
        const int src9 = src[9*srcStride];
2816
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2817
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2818
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2819
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2820
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2821
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2822
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2823
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2824
        src++;
2825
        dst++;
2826
    }
2827
}
2828

    
2829
/* WMV2 (mspel) 8x8 motion compensation entry points, one per supported
 * sub-pel position (X,Y).  halfH is filtered with 11 rows (one above, two
 * below) so the subsequent vertical pass has its taps available. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2878
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2879
    int x;
2880
    const int strength= ff_h263_loop_filter_strength[qscale];
2881

    
2882
    for(x=0; x<8; x++){
2883
        int d1, d2, ad1;
2884
        int p0= src[x-2*stride];
2885
        int p1= src[x-1*stride];
2886
        int p2= src[x+0*stride];
2887
        int p3= src[x+1*stride];
2888
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2889

    
2890
        if     (d<-2*strength) d1= 0;
2891
        else if(d<-  strength) d1=-2*strength - d;
2892
        else if(d<   strength) d1= d;
2893
        else if(d< 2*strength) d1= 2*strength - d;
2894
        else                   d1= 0;
2895

    
2896
        p1 += d1;
2897
        p2 -= d1;
2898
        if(p1&256) p1= ~(p1>>31);
2899
        if(p2&256) p2= ~(p2>>31);
2900

    
2901
        src[x-1*stride] = p1;
2902
        src[x+0*stride] = p2;
2903

    
2904
        ad1= FFABS(d1)>>1;
2905

    
2906
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2907

    
2908
        src[x-2*stride] = p0 - d2;
2909
        src[x+  stride] = p3 + d2;
2910
    }
2911
    }
2912
}
2913

    
2914
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2915
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2916
    int y;
2917
    const int strength= ff_h263_loop_filter_strength[qscale];
2918

    
2919
    for(y=0; y<8; y++){
2920
        int d1, d2, ad1;
2921
        int p0= src[y*stride-2];
2922
        int p1= src[y*stride-1];
2923
        int p2= src[y*stride+0];
2924
        int p3= src[y*stride+1];
2925
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2926

    
2927
        if     (d<-2*strength) d1= 0;
2928
        else if(d<-  strength) d1=-2*strength - d;
2929
        else if(d<   strength) d1= d;
2930
        else if(d< 2*strength) d1= 2*strength - d;
2931
        else                   d1= 0;
2932

    
2933
        p1 += d1;
2934
        p2 -= d1;
2935
        if(p1&256) p1= ~(p1>>31);
2936
        if(p2&256) p2= ~(p2>>31);
2937

    
2938
        src[y*stride-1] = p1;
2939
        src[y*stride+0] = p2;
2940

    
2941
        ad1= FFABS(d1)>>1;
2942

    
2943
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2944

    
2945
        src[y*stride-2] = p0 - d2;
2946
        src[y*stride+1] = p3 + d2;
2947
    }
2948
    }
2949
}
2950

    
2951
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8 block.
 * First pass filters vertically into temp[] (edge rows just scaled by 4),
 * second pass filters horizontally and writes back with rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * H.264 normal (non-intra) luma deblocking of one edge, 4 groups of 4
 * samples. xstride steps across the edge, ystride steps along it.
 * tc0 holds one clipping threshold per 4-sample group; a negative value
 * disables filtering for that group.
 */
static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only if the edge activity is below alpha/beta thresholds */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* flat p side: additionally correct p1 and widen tc by 1 */
                if( FFABS( p2 - p0 ) < beta ) {
                    if(tc0[i])
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                /* flat q side: additionally correct q1 and widen tc by 1 */
                if( FFABS( q2 - q0 ) < beta ) {
                    if(tc0[i])
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Vertical filtering (samples across the edge lie stride apart, i.e. a
 * horizontal edge). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Horizontal filtering (samples across the edge are adjacent in memory,
 * i.e. a vertical edge). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 intra ("strong") luma deblocking of one 16-sample edge.
 * No tc clipping: where the edge is very flat, strong 4/5-tap filters
 * rewrite up to three pixels on each side; otherwise a weak 3-tap filter
 * touches only p0/q0.
 */
static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* extra flatness check selects the strong filter path */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* weak path: p0', q0' only */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Intra strong filter, vertical filtering direction (horizontal edge). */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra strong filter, horizontal filtering direction (vertical edge). */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * H.264 normal chroma deblocking of one edge: 4 groups of 2 samples.
 * Only p0/q0 are modified; tc0[i] <= 0 skips a group.
 */
static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma filter, vertical filtering direction (horizontal edge). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma filter, horizontal filtering direction (vertical edge). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 intra chroma deblocking of one 8-sample edge.
 * Unconditional 3-tap filter on p0/q0 when the activity thresholds pass.
 */
static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Intra chroma filter, vertical filtering direction (horizontal edge). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra chroma filter, horizontal filtering direction (vertical edge). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences between two 16-pixel-wide blocks of height h.
 * The first argument (context) is unused in the C reference version.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the rounded average of each pix2 pixel
 * and its right neighbor (reads 17 pixels of pix2 per line).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the rounded average of each pix2 pixel
 * and the pixel one line below it (reads h+1 lines of pix2).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the rounded average of each 2x2 pix2
 * neighborhood (reads 17 columns and h+1 lines of pix2).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Sum of absolute differences between two 8-pixel-wide blocks of height h.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of an 8-wide block against the rounded average of each pix2 pixel
 * and its right neighbor (reads 9 pixels of pix2 per line).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of an 8-wide block against the rounded average of each pix2 pixel
 * and the pixel one line below it (reads h+1 lines of pix2).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of an 8-wide block against the rounded average of each 2x2 pix2
 * neighborhood (reads 9 columns and h+1 lines of pix2).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Noise-preserving SSE metric, 16-pixel-wide blocks: squared error plus a
 * weighted absolute difference of the local 2x2 gradients of the two
 * blocks (penalizes differently-textured areas, not just pixel error).
 * v may be NULL, in which case a fixed gradient weight of 8 is used;
 * otherwise the weight comes from avctx->nsse_weight.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * 8-pixel-wide variant of nsse16_c: SSE plus weighted gradient-difference
 * penalty. See nsse16_c for the metric description.
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * Evaluates the weighted squared error of rem + basis*scale over an 8x8
 * block: each basis coefficient is rescaled from BASIS_SHIFT to
 * RECON_SHIFT precision with rounding, added to rem, and the square of
 * the weighted value is accumulated.
 * @return sum of (weight*value)^2, scaled down by 2^6 overall (>>4 inside
 *         the loop plus the final >>2)
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
/**
 * rem += basis*scale for an 8x8 block, with the same rounding rescale
 * from BASIS_SHIFT to RECON_SHIFT precision as try_8x8basis_c.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
/**
 * Permutes an 8x8 block in place.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    /* copy the nonzero coefficients out and clear them in the block;
     * clearing is required because source and destination positions of
     * the permutation may overlap */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* scatter the saved coefficients to their permuted positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
/* Null comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fills cmp[0..5] with the comparison functions selected by the low byte
 * of type, taken from the DSPContext tables (one slot per block size).
 * Unknown type values log an error and leave the slots NULL (from the
 * initial memset).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
/* Zeroes one 8x8 DCT coefficient block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
3524
/**
 * Zeroes six consecutive 8x8 DCT blocks (one macroblock's worth);
 * equivalent to memset(blocks, 0, sizeof(DCTELEM)*6*64).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3533
    long i;
3534
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3535
        long a = *(long*)(src+i);
3536
        long b = *(long*)(dst+i);
3537
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3538
    }
3539
    for(; i<w; i++)
3540
        dst[i+0] += src[i+0];
3541
}
3542

    
3543
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3544
    long i;
3545
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3546
        long a = *(long*)(src1+i);
3547
        long b = *(long*)(src2+i);
3548
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3549
    }
3550
    for(; i<w; i++)
3551
        dst[i] = src1[i]+src2[i];
3552
}
3553

    
3554
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3555
    long i;
3556
#if !HAVE_FAST_UNALIGNED
3557
    if((long)src2 & (sizeof(long)-1)){
3558
        for(i=0; i+7<w; i+=8){
3559
            dst[i+0] = src1[i+0]-src2[i+0];
3560
            dst[i+1] = src1[i+1]-src2[i+1];
3561
            dst[i+2] = src1[i+2]-src2[i+2];
3562
            dst[i+3] = src1[i+3]-src2[i+3];
3563
            dst[i+4] = src1[i+4]-src2[i+4];
3564
            dst[i+5] = src1[i+5]-src2[i+5];
3565
            dst[i+6] = src1[i+6]-src2[i+6];
3566
            dst[i+7] = src1[i+7]-src2[i+7];
3567
        }
3568
    }else
3569
#endif
3570
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3571
        long a = *(long*)(src1+i);
3572
        long b = *(long*)(src2+i);
3573
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3574
    }
3575
    for(; i<w; i++)
3576
        dst[i+0] = src1[i+0]-src2[i+0];
3577
}
3578

    
3579
/**
 * HuffYUV median prediction, decode direction: reconstructs dst from the
 * residuals in diff using mid_pred(left, src1[i], left + src1[i] - lt).
 * left/left_top carry the running state in and out across calls.
 * NOTE(review): src1 appears to be the previously decoded line — confirm
 * against the huffyuv decoder.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* median-of-three gradient prediction, then add the residual */
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
/**
 * HuffYUV median prediction, encode direction: writes the residual
 * dst[i] = src2[i] - mid_pred(left, src1[i], left + src1[i] - lt);
 * inverse of add_hfyu_median_prediction_c. left/left_top carry state
 * across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/**
 * HuffYUV left prediction: dst[i] becomes the running sum of src[0..i]
 * plus the seed acc (stores truncate to 8 bits via the uint8_t array).
 * @return the final accumulator value (not masked to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w; i++){
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
/* byte offsets of the B/G/R/A components inside a packed 32-bit pixel,
 * dependent on host endianness */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Left prediction for packed BGR32 pixels: each channel is a running sum
 * over the line, seeded and returned through the red/green/blue/alpha
 * pointers (stores truncate to 8 bits via the uint8_t array).
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
/* o1/o2 = sum/difference butterfly of i1, i2 (separate outputs) */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: x,y = x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly stage folded into the absolute sum */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD of an 8x8 block: sum of absolute coefficients of the 2-D 8-point
 * Hadamard transform of the difference block src - dst.
 * (Dead "#if 0" max-tracking debug code removed.)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard transform of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass + accumulation of absolute coefficient values;
     * the last butterfly stage is folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * Intra SATD of an 8x8 block: sum of absolute 2-D Hadamard coefficients
 * of src itself (no reference block), with the DC term subtracted so the
 * block mean does not bias the score.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass + accumulation of absolute coefficient values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
/**
 * DCT-based comparison metric: forward DCT of the 8x8 difference block
 * src1 - src2, then the sum of absolute coefficient values.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(DCTELEM, temp)[64];

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#if CONFIG_GPL
/* One 8-point H.264-style integer DCT; SRC/DST are macros defined at each
 * use site so the same code serves row and column passes. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Comparison metric using the 8x8 integer transform above: row pass over
 * the difference block in place, then a column pass that accumulates the
 * absolute coefficient values directly into sum.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3854
    MpegEncContext * const s= (MpegEncContext *)c;
3855
    DECLARE_ALIGNED_16(DCTELEM, temp)[64];
3856
    int sum=0, i;
3857

    
3858
    assert(h==8);
3859

    
3860
    s->dsp.diff_pixels(temp, src1, src2, stride);
3861
    s->dsp.fdct(temp);
3862

    
3863
    for(i=0; i<64; i++)
3864
        sum= FFMAX(sum, FFABS(temp[i]));
3865

    
3866
    return sum;
3867
}
3868

    
3869
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3870
    MpegEncContext * const s= (MpegEncContext *)c;
3871
    DECLARE_ALIGNED_16(DCTELEM, temp)[64*2];
3872
    DCTELEM * const bak = temp+64;
3873
    int sum=0, i;
3874

    
3875
    assert(h==8);
3876
    s->mb_intra=0;
3877

    
3878
    s->dsp.diff_pixels(temp, src1, src2, stride);
3879

    
3880
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3881

    
3882
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3883
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3884
    ff_simple_idct(temp); //FIXME
3885

    
3886
    for(i=0; i<64; i++)
3887
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3888

    
3889
    return sum;
3890
}
3891

    
3892
/**
 * Rate-distortion score of coding an 8x8 difference block: quantizes the
 * residual, counts the VLC bits it would take, reconstructs the block and
 * measures the SSE against the original, then combines the two with a
 * lambda derived from qscale. Only h==8 is supported (asserted).
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(DCTELEM, temp)[64];
    DECLARE_ALIGNED_16(uint8_t, lsrc1)[64];
    DECLARE_ALIGNED_16(uint8_t, lsrc2)[64];
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local aligned copies; lsrc2 is later overwritten with the
     * reconstruction, so the originals must not be touched */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        /* intra: DC is coded separately, the AC scan starts at 1 */
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count run/level VLC bits along the scan order */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias into the 0..127 table range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out of table range: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be nonzero */

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT back onto lsrc2 */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* distortion + lambda*rate; 109/128 scales bits*qscale^2 */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3969
    MpegEncContext * const s= (MpegEncContext *)c;
3970
    const uint8_t *scantable= s->intra_scantable.permutated;
3971
    DECLARE_ALIGNED_16(DCTELEM, temp)[64];
3972
    int i, last, run, bits, level, start_i;
3973
    const int esc_length= s->ac_esc_length;
3974
    uint8_t * length;
3975
    uint8_t * last_length;
3976

    
3977
    assert(h==8);
3978

    
3979
    s->dsp.diff_pixels(temp, src1, src2, stride);
3980

    
3981
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3982

    
3983
    bits=0;
3984

    
3985
    if (s->mb_intra) {
3986
        start_i = 1;
3987
        length     = s->intra_ac_vlc_length;
3988
        last_length= s->intra_ac_vlc_last_length;
3989
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3990
    } else {
3991
        start_i = 0;
3992
        length     = s->inter_ac_vlc_length;
3993
        last_length= s->inter_ac_vlc_last_length;
3994
    }
3995

    
3996
    if(last>=start_i){
3997
        run=0;
3998
        for(i=start_i; i<last; i++){
3999
            int j= scantable[i];
4000
            level= temp[j];
4001

    
4002
            if(level){
4003
                level+=64;
4004
                if((level&(~127)) == 0){
4005
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
4006
                }else
4007
                    bits+= esc_length;
4008
                run=0;
4009
            }else
4010
                run++;
4011
        }
4012
        i= scantable[last];
4013

    
4014
        level= temp[i] + 64;
4015

    
4016
        assert(level - 64);
4017

    
4018
        if((level&(~127)) == 0){
4019
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4020
        }else
4021
            bits+= esc_length;
4022
    }
4023

    
4024
    return bits;
4025
}
4026

    
4027
/*
 * Intra vertical SAD: sum of absolute differences between vertically
 * adjacent pixels of one block, 'size' pixels wide. The second pixel
 * argument is unused.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                          \
    int x, y;                                               \
                                                            \
    for (y = 1; y < h; y++) {                               \
        for (x = 0; x < size; x++) {                        \
            const int d = s[x] - s[x + stride];             \
            total += d < 0 ? -d : d;                        \
        }                                                   \
        s += stride;                                        \
    }                                                       \
                                                            \
    return total;                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/**
 * Inter vertical SAD, 16 pixels wide: sums |vertical gradient of the
 * difference image s1-s2| over rows 1..h-1.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d < 0 ? -d : d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
#define SQ(a) ((a)*(a))
/*
 * Intra vertical SSE: sum of squared differences between vertically
 * adjacent pixels of one block, 'size' pixels wide. The second pixel
 * argument is unused.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                          \
    int x, y;                                               \
                                                            \
    for (y = 1; y < h; y++) {                               \
        for (x = 0; x < size; x++)                          \
            total += SQ(s[x] - s[x + stride]);              \
        s += stride;                                        \
    }                                                       \
                                                            \
    return total;                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/**
 * Inter vertical SSE, 16 pixels wide: sums the squared vertical gradient
 * of the difference image s1-s2 over rows 1..h-1.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;
    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
/* Instantiate the 16x16 variants of the 8x8 comparison functions.
 * NOTE(review): WRAPPER8_16_SQ is defined earlier in this file (outside
 * this view); presumably it sums the 8x8 metric over the four quadrants
 * of a 16x16 block — verify against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/** Multiply dst by src element-wise, in place. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;
    for (k = 0; k < len; k++) {
        dst[k] = dst[k] * src[k];
    }
}
/** dst[k] = src0[k] * src1[len-1-k]: multiply src0 by src1 reversed. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
/** Fused multiply-add: dst[k] = src0[k]*src1[k] + src2[k]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;
    for (k = 0; k < len; k++) {
        const float prod = src0[k] * src1[k];
        dst[k] = prod + src2[k];
    }
}
/**
 * Overlap-add windowing: combines src0 (forward) and src1 (reversed)
 * under the symmetric window 'win', writing 2*len output samples to dst
 * (dst, src0 and win index the second half via an offset of len).
 * add_bias is added to every output sample.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int p, q;
    /* re-base pointers on the midpoint so p runs -len..-1 and q len-1..0 */
    dst  += len;
    win  += len;
    src0 += len;
    for (p = -len, q = len - 1; p < 0; p++, q--) {
        const float a  = src0[p];
        const float b  = src1[q];
        const float wp = win[p];
        const float wq = win[q];
        dst[p] = a * wq - b * wp + add_bias;
        dst[q] = a * wp + b * wq + add_bias;
    }
}
/** Scale src by the scalar mul into dst. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;
    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
/**
 * Multiply src by a scalar and, pairwise, by 2-element vectors taken
 * consecutively from sv (one sv entry per output pair).
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 2) {
        const float *pair = *sv++;
        dst[k]     = src[k]     * pair[0] * mul;
        dst[k + 1] = src[k + 1] * pair[1] * mul;
    }
}
/**
 * Multiply src by a scalar and, in groups of four, by 4-element vectors
 * taken consecutively from sv (one sv entry per output quad).
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 4) {
        const float *quad = *sv++;
        dst[k]     = src[k]     * quad[0] * mul;
        dst[k + 1] = src[k + 1] * quad[1] * mul;
        dst[k + 2] = src[k + 2] * quad[2] * mul;
        dst[k + 3] = src[k + 3] * quad[3] * mul;
    }
}
/**
 * Expand consecutive 2-element vectors from sv into dst, scaled by mul.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;
    for (k = 0; k < len; k += 2) {
        const float *pair = *sv++;
        dst[k]     = pair[0] * mul;
        dst[k + 1] = pair[1] * mul;
    }
}
/**
 * Expand consecutive 4-element vectors from sv into dst, scaled by mul.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;
    for (k = 0; k < len; k += 4) {
        const float *quad = *sv++;
        dst[k]     = quad[0] * mul;
        dst[k + 1] = quad[1] * mul;
        dst[k + 2] = quad[2] * mul;
        dst[k + 3] = quad[3] * mul;
    }
}
/**
 * Element-wise butterfly: v1[k] becomes the sum, v2[k] the difference
 * (v1-v2). The restrict qualifiers promise non-overlapping arrays.
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;
    for (k = 0; k < len; k++) {
        const float sum  = v1[k] + v2[k];
        const float diff = v1[k] - v2[k];
        v1[k] = sum;
        v2[k] = diff;
    }
}
/** Dot product of two float vectors (sequential accumulation). */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k;

    for (k = 0; k < len; k++)
        acc += v1[k] * v2[k];

    return acc;
}
/** Convert int samples to float while scaling by mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;
    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
4229
                   uint32_t maxi, uint32_t maxisign)
4230
{
4231

    
4232
    if(a > mini) return mini;
4233
    else if((a^(1<<31)) > maxisign) return maxi;
4234
    else return a;
<