/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "h263.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* lpc.c */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
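/* Note: ~0UL/255 evaluates to 0x01 repeated in every byte of a native word
 * (0x01010101 on a 32-bit unsigned long, 0x0101010101010101 on a 64-bit one),
 * so multiplying it by 0x7f or 0x80 replicates that byte across the whole
 * word at the CPU's native arithmetic size. */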

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
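
/* A minimal sketch (not part of the original file) of how ff_inverse turns a
 * division by a runtime divisor b into a multiply and shift; fast_div is a
 * hypothetical helper name, valid for 2<=b<=256 within the range given above: */
#if 0
static inline uint32_t fast_div(uint32_t a, uint32_t b)
{
    return ((uint64_t)a * ff_inverse[b]) >> 32; /* == a/b for 0<=a<=16909558 */
}
#endif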

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
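
/* Note: after ff_init_scantable(), st->raster_end[i] holds the largest
 * permuted (raster-order) index among scan positions 0..i; for a block whose
 * last non-zero coefficient sits at scan position i this presumably bounds
 * the region a decoder actually has to process. */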

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
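
/* pix_norm1_c sums the squared sample values of a 16x16 block via the
 * ff_squareTbl lookup; the live branch loads one machine word per 8 pixels
 * and extracts the bytes by shifting, avoiding 8 separate byte loads. */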

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
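
/* The sse*_c functions below return the sum of squared differences of two
 * blocks. sq points 256 entries into ff_squareTbl so it can be indexed
 * directly by pix1[x] - pix2[x], which lies in [-255, 255]. */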

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}


#if CONFIG_SNOW_ENCODER //dwt is in snow.c
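/* w_c below rates the difference of two blocks by running a spatial discrete
 * wavelet transform (5/3 for type 1, 9/7 for type 0, both from snow.c) over
 * the difference and summing the absolute subband coefficients, each
 * weighted by the scale[] table; roughly a wavelet-domain SATD for the snow
 * encoder's rate-distortion decisions. */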
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width x height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
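
/* A hedged usage sketch (not part of the original file): when a motion vector
 * points partly outside the reference picture, a caller would first build an
 * edge-extended copy and then run its ordinary MC routine on that copy. The
 * buffer name is illustrative; per the doc above, buf shares src's linesize: */
#if 0
    uint8_t *ptr = src + src_y*linesize + src_x;
    if (src_x < 0 || src_y < 0 || src_x + block_w > w || src_y + block_h > h) {
        ff_emulated_edge_mc(edge_emu_buffer, ptr, linesize,
                            block_w, block_h, src_x, src_y, w, h);
        ptr = edge_emu_buffer; /* continue MC on the replicated-border block */
    }
#endif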

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}
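
/* Note: add_pixels4_c steps block += 4 per row, i.e. it expects a packed
 * 4-column coefficient block, while the *_clamped4/2 variants above read the
 * left edge of a normal 8-wide DCT block and therefore step block += 8. */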

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
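
/* The two averaging identities used throughout PIXOP2 work on 4 or 8 bytes
 * at once (SWAR). Per byte lane:
 *   (a | b) - (((a ^ b) & 0xFE..FE) >> 1)  ==  (a + b + 1) >> 1   (round up)
 *   (a & b) + (((a ^ b) & 0xFE..FE) >> 1)  ==  (a + b) >> 1       (round down)
 * The 0xFE mask clears each lane's LSB before the shift so that no bit can
 * leak into the neighbouring byte. */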

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
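
/* gmc1_c fetches an 8-wide row at a 1/16-pel offset. A, B, C and D are the
 * standard bilinear weights (16-x)(16-y), x(16-y), (16-x)y and xy; they
 * always sum to 256, hence the >>8 after adding the rounder. */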

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
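
/* ff_gmc_c applies a global-motion (affine) warp: (vx,vy) steps by (dxx,dyx)
 * per column and (ox,oy) by (dxy,dyy) per row; vx>>16 yields a sub-pel source
 * coordinate with 'shift' fractional bits. Each destination pixel is then
 * bilinearly interpolated from the clipped source position, with cheaper 1-D
 * and nearest paths once a coordinate falls outside the frame. */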

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
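
/* The tpel (third-pel) interpolators below serve codecs whose sub-pel
 * positions are thirds of a pixel (e.g. SVQ3). The magic constants are exact
 * divisions for the value ranges occurring here: 683*x >> 11 == x/3
 * (683*3 == 2049) and 2731*x >> 15 == x/12 (2731*12 == 32772). */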

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1458

    
1459
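
/* The avg_tpel_* variants compute the same thirds-pel interpolation as
 * the put_tpel_* functions above and then blend the result into the
 * destination with a round-up average: dst = (dst + interp + 1) >> 1. */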
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
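
/* H.264 chroma MC: bilinear interpolation with 1/8-sample accuracy.
 * The weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y and D=xy always sum to
 * 64, so op_put below rounds with ((b) + 32) >> 6; e.g. x == y == 4
 * gives A=B=C=D=16, a plain four-sample average. When D == 0 (x or y
 * is zero) the filter degenerates to a 1-D blend along one axis, which
 * the E/step fast path exploits. */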
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
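
/* VC-1 "no rounding" chroma MC: the same bilinear weights as the H.264
 * version above, but the rounding constant is lowered from 32 to
 * 32 - 4 = 28, giving the downward-biased result that VC-1's
 * rounding-control mode expects. */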
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
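
/* MPEG-4 quarter-pel MC. The _mcXY suffix encodes the quarter-sample
 * phase in x and y (each 0..3): mc00 is the integer-position copy,
 * mc20/mc02 are the pure half-pel filters, and the remaining positions
 * are synthesized by averaging half-pel planes (pixels8_l2 and, in the
 * _old_c variants, pixels8_l4). */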
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
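
/* The MPEG-4 filter above uses the taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * mirrored at the block edges; the taps sum to 32, hence the
 * ((b) + 16) >> 5 rounding in op_put below. The _no_rnd_ variants add
 * only 15 before the shift, i.e. they round downwards. */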
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
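
/* H.264 luma half-pel filtering uses the 6-tap kernel
 * (1, -5, 20, 20, -5, 1), which also sums to 32. The separable hv path
 * keeps the horizontal pass in a 16-bit tmp[] at full precision, so the
 * vertical pass rounds once by 32*32 = 1024 via the op2_* macros
 * (((b) + 512) >> 10). */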
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
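
/* H264_MC instantiates the 16 quarter-pel positions for one block
 * size; as in the MPEG-4 code, _mcXY gives the quarter-sample offset
 * and the fractional positions are built by averaging the nearest
 * half-pel planes with pixels ## SIZE ## _l2. */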
#define H264_MC(OPNAME, SIZE) \
2497
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2498
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2499
}\
2500
\
2501
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2502
    uint8_t half[SIZE*SIZE];\
2503
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2504
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2505
}\
2506
\
2507
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2508
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2509
}\
2510
\
2511
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2512
    uint8_t half[SIZE*SIZE];\
2513
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2514
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2515
}\
2516
\
2517
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2518
    uint8_t full[SIZE*(SIZE+5)];\
2519
    uint8_t * const full_mid= full + SIZE*2;\
2520
    uint8_t half[SIZE*SIZE];\
2521
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2522
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2523
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2524
}\
2525
\
2526
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
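/* rounding of the six-tap filter: one lowpass pass scales pixel values by
   32, hence the +16 bias and >>5 (op_put/op_avg); two passes scale by
   1024, hence +512 and >>10 (op2_put/op2_avg); cm[] clips to 0..255 */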

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
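/* H.264 weighted prediction: op_scale1 is the unidirectional form
   (block[x]*weight + offset) >> log2_denom, op_scale2 the bidirectional
   form with two weights and a (log2_denom+1) shift, each clipped to 0..255 */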
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
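
/* WMV2 half-pel interpolation: a 4-tap (-1, 9, 9, -1)/16 lowpass filter,
   applied horizontally here and vertically further below */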
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
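
/* H.263 in-loop deblocking: d1 follows a tent-shaped ramp of the filter
   strength so that strong edges stay untouched; the (p&256) tests are a
   branchless clip of p1/p2 to 0..255 (valid for values in -256..511) */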
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
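
/* H.264 in-loop deblocking: alpha is the edge threshold, beta the activity
   threshold, and tc0[] the per-4-pixel-group clipping limit (a negative
   entry skips the group) */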
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
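
/* sum-of-absolute-differences motion estimation metrics; the _x2, _y2 and
   _xy2 variants compare pix1 against the half-pel interpolation of pix2 */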
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
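
/* NSSE (noise preserving SSE): plain SSE plus the difference of the local
   gradients of the two blocks, weighted by avctx->nsse_weight (8 when no
   context is available) */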
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
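
/* helpers for the encoder's quantizer noise shaping: try_8x8basis_c returns
   the weighted squared error that results from adding one scaled basis
   function to the residual, add_8x8basis_c actually applies that change */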
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
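
/* add_bytes/diff_bytes process one machine word at a time: the pb_7f/pb_80
   masks add (or subtract) the low 7 bits of every byte directly and patch
   the top bit back in via XOR, so no carry crosses a byte boundary */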
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
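
/* HuffYUV median prediction: each pixel is predicted as
   mid_pred(left, top, left + top - topleft); src1 holds the line above,
   diff/dst the residual */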
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#else
#define B 0
#define G 1
#define R 2
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue){
    int i;
    int r,g,b;
    r= *red;
    g= *green;
    b= *blue;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
    }

    *red= r;
    *green= g;
    *blue= b;
}
#undef B
#undef G
#undef R

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
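
/* 8x8 Hadamard transform built from the butterflies above, taken over the
   difference of two blocks (SATD) or over the source itself (the intra
   variant, which subtracts the DC term) */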
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
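
/* the transform below matches the H.264 high-profile 8x8 integer DCT and
   is kept under CONFIG_GPL because it was ported from x264 */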
#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
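
/* rate-distortion metric: quantize, dequantize and inverse-transform the
   difference block, then return the SSE distortion plus the estimated VLC
   bit cost scaled by qscale^2 (109/128 ~= the usual 0.85 lambda factor) */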
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
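
/* vertical-gradient metrics: the intra variants sum |s[x] - s[x+stride]|
   style terms of a single block, vsad16/vsse16 the difference of the
   vertical gradients of two blocks */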
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

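/* Build the 16x16 comparison functions from the 8x8 kernels defined above;
 * WRAPPER8_16_SQ (defined earlier in this file) accumulates the 8x8 metric
 * over the four quadrants of the 16x16 block. */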
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

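/* dst[i] *= src[i] -- element-wise in-place multiplication */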
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

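/* dst[i] = src0[i] * src1[len-1-i] -- multiply by a reversed vector */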
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

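/* dst[i] = src0[i]*src1[i] + src2[i] -- element-wise multiply-add */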
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

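/**
 * Overlap two blocks under a symmetric window, writing 2*len samples:
 *   dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias
 *   dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias
 * where i runs over -len..-1 and j = -i-1, after dst/win/src0 have been
 * advanced by len. This is the MDCT overlap-add windowing step used by
 * the float audio decoders; add_bias is added to every output sample.
 */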
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

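/* dst[i] = src[i] * mul -- scale a vector by a scalar */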
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

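/* multiply src by a sequence of 2-element sub-vectors and a scalar:
 * dst[2k+n] = src[2k+n] * sv[k][n] * mul, n = 0,1 */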
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

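/* 4-element variant: dst[4k+n] = src[4k+n] * sv[k][n] * mul, n = 0..3 */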
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

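/* as vector_fmul_sv_scalar_2_c but without a src operand:
 * dst[2k+n] = sv[k][n] * mul */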
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

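/* 4-element variant: dst[4k+n] = sv[k][n] * mul, n = 0..3 */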
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

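/* in-place butterfly: (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]) */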
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

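/* dot product of two float vectors */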
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

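/* convert int32 samples to float with scaling: dst[i] = src[i] * mul */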
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

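/**
 * Clip a single float given as its IEEE-754 bit pattern; mini, maxi and
 * maxisign are likewise bit patterns of the bounds. Helper for the float
 * vector clipping code defined after it (beyond this excerpt).
 */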
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;