Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 3da11804

History | View | Annotate | Download (175 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file libavcodec/dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "mathops.h"
36
#include "snow.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "lpc.h"
40
#include "ac3dec.h"
41
#include "vorbis.h"
42
#include "png.h"
43

    
44
/* snow.c */
45
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
46

    
47
/* eaidct.c */
48
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
49

    
50
/* binkidct.c */
51
void ff_bink_idct_c    (DCTELEM *block);
52
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
53
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
54

    
55
/* Clipping lookup table: index with MAX_NEG_CROP offset to clamp to 0..255.
 * Zero-initialized here; NOTE(review): filled elsewhere at init — confirm in dsputil init code. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table used by the sse*/pix_norm functions below (indexed via ff_squareTbl + 256).
 * Zero-initialized here; NOTE(review): filled elsewhere at init. */
uint32_t ff_squareTbl[512] = {0, };
57

    
58
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)   /* 0x7f replicated into every byte of an unsigned long */
#define pb_80 (~0UL/255 * 0x80)   /* 0x80 replicated into every byte of an unsigned long */
61

    
62
/* Standard zigzag scan: maps scan position -> raster (row-major) coefficient index. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
72

    
73
/* Specific zigzag scan for 2-4-8 idct. NOTE that unlike the
   specification, we interleave the fields (scan position -> raster index). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
85

    
86
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (NOTE(review): zero here, presumably filled at init time — verify in dsputil init code) */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
88

    
89
/* Alternate horizontal scan order: scan position -> raster coefficient index. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
99

    
100
/* Alternate vertical scan order: scan position -> raster coefficient index. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
110

    
111
/* Reciprocal table for division by multiplication:
 * a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24.
 * Entry b is roughly 2^32/b rounded up; entries 0 and 1 are sentinels. */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
148

    
149
/* Input permutation for the simple_idct_mmx: coefficient index remap
 * (raster index -> index expected by the MMX IDCT). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
160

    
161
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
162

    
163
/**
 * Initialize a ScanTable from a scan order.
 * @param permutation   maps a raster coefficient index to its (CPU specific)
 *                      permuted index
 * @param st            ScanTable to fill
 * @param src_scantable scan order, as raster (unpermuted) coefficient indices
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    /* apply the coefficient permutation to the scan order */
    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        /* inverse mapping (raster index -> scan position), only kept on PPC */
        st->inverse[j] = i;
#endif
    }

    /* raster_end[i] = largest permuted index seen at scan positions 0..i */
    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
186

    
187
/* Sum of all 256 samples of a 16x16 block. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
208

    
209
/* Sum of squared sample values of a 16x16 block.
 * sq points at the centre of ff_squareTbl, assumed to hold x*x for
 * x in [-256, 255] (NOTE(review): table is filled elsewhere at init). */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward reference version, kept for clarity */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: fetch 8 pixels with one load.
             * NOTE(review): type-punned, potentially unaligned load —
             * assumes the target tolerates it; verify before porting. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-pixel loads (same aliasing caveat) */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
256

    
257
/* Byte-swap w 32-bit words from src into dst (buffers may be the same). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    /* bulk: eight words per iteration */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    /* tail: remaining words one by one */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
274

    
275
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
276
{
277
    int s, i;
278
    uint32_t *sq = ff_squareTbl + 256;
279

    
280
    s = 0;
281
    for (i = 0; i < h; i++) {
282
        s += sq[pix1[0] - pix2[0]];
283
        s += sq[pix1[1] - pix2[1]];
284
        s += sq[pix1[2] - pix2[2]];
285
        s += sq[pix1[3] - pix2[3]];
286
        pix1 += line_size;
287
        pix2 += line_size;
288
    }
289
    return s;
290
}
291

    
292
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
293
{
294
    int s, i;
295
    uint32_t *sq = ff_squareTbl + 256;
296

    
297
    s = 0;
298
    for (i = 0; i < h; i++) {
299
        s += sq[pix1[0] - pix2[0]];
300
        s += sq[pix1[1] - pix2[1]];
301
        s += sq[pix1[2] - pix2[2]];
302
        s += sq[pix1[3] - pix2[3]];
303
        s += sq[pix1[4] - pix2[4]];
304
        s += sq[pix1[5] - pix2[5]];
305
        s += sq[pix1[6] - pix2[6]];
306
        s += sq[pix1[7] - pix2[7]];
307
        pix1 += line_size;
308
        pix2 += line_size;
309
    }
310
    return s;
311
}
312

    
313
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
314
{
315
    int s, i;
316
    uint32_t *sq = ff_squareTbl + 256;
317

    
318
    s = 0;
319
    for (i = 0; i < h; i++) {
320
        s += sq[pix1[ 0] - pix2[ 0]];
321
        s += sq[pix1[ 1] - pix2[ 1]];
322
        s += sq[pix1[ 2] - pix2[ 2]];
323
        s += sq[pix1[ 3] - pix2[ 3]];
324
        s += sq[pix1[ 4] - pix2[ 4]];
325
        s += sq[pix1[ 5] - pix2[ 5]];
326
        s += sq[pix1[ 6] - pix2[ 6]];
327
        s += sq[pix1[ 7] - pix2[ 7]];
328
        s += sq[pix1[ 8] - pix2[ 8]];
329
        s += sq[pix1[ 9] - pix2[ 9]];
330
        s += sq[pix1[10] - pix2[10]];
331
        s += sq[pix1[11] - pix2[11]];
332
        s += sq[pix1[12] - pix2[12]];
333
        s += sq[pix1[13] - pix2[13]];
334
        s += sq[pix1[14] - pix2[14]];
335
        s += sq[pix1[15] - pix2[15]];
336

    
337
        pix1 += line_size;
338
        pix2 += line_size;
339
    }
340
    return s;
341
}
342

    
343

    
344
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
345
/* Wavelet-domain difference score between two blocks: the pix1-pix2
 * residual is transformed with ff_spatial_dwt and the subbands are
 * accumulated with per-subband weights.
 * type selects the filter: 0 = 9/7, 1 = 5/3 (see the scale table below).
 * w must equal h and be 8, 16 or 32. */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* decomposition levels */
    int tmp[32*32];
    int level, ori;
    /* subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* residual, scaled by 16, laid out on a fixed 32-wide grid */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* weighted sum of absolute coefficients per level/orientation;
     * ori&1 selects the right half, ori&2 the bottom half of a level */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;  /* undo the <<4 input scale and the weight scale */
}
413

    
414
/* 5/3-wavelet score of an 8-wide block (type=1 selects the 5/3 filter). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
417

    
418
/* 9/7-wavelet score of an 8-wide block (type=0 selects the 9/7 filter). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
421

    
422
/* 5/3-wavelet score of a 16-wide block. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
425

    
426
/* 9/7-wavelet score of a 16-wide block. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
429

    
430
/* 5/3-wavelet score of a 32-wide block (non-static: used from other files). */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
433

    
434
/* 9/7-wavelet score of a 32-wide block (non-static: used from other files). */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
437
#endif
438

    
439
/* Replicate the outermost samples of a width x height image into a border
 * of w extra pixels on every side; buf points at the top-left image pixel
 * and the border memory must exist around it.
 * FIXME check that this is ok for mpeg4 interlaced */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *last_line = buf + (height - 1) * wrap;
    uint8_t *ptr = buf;
    int i;

    /* left and right columns of the image rows */
    for (i = 0; i < height; i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width - 1], w);
        ptr += wrap;
    }

    /* top and bottom border rows (image columns only) */
    for (i = 1; i <= w; i++) {
        memcpy(buf - i * wrap, buf, width);
        memcpy(last_line + i * wrap, last_line, width);
    }

    /* the four corner areas */
    for (i = 1; i <= w; i++) {
        memset(buf - i * wrap - w, buf[0], w);                         /* top left */
        memset(buf - i * wrap + width, buf[width - 1], w);             /* top right */
        memset(last_line + i * wrap - w, last_line[0], w);             /* bottom left */
        memset(last_line + i * wrap + width, last_line[width - 1], w); /* bottom right */
    }
}
467

    
468
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* clip the block position so that at least one sample row overlaps the picture */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    /* same for the horizontal direction */
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* part of the block that actually lies inside the source picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first copied row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last copied row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* finally replicate the left/right columns over the full block height */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
538

    
539
/* Copy an 8x8 block of samples into a DCT coefficient block (widening). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block += 8;
    }
}
557

    
558
/* Store the 8x8 sample difference s1 - s2 into a DCT coefficient block. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
577

    
578

    
579
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
580
                                 int line_size)
581
{
582
    int i;
583
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
584

    
585
    /* read the pixels */
586
    for(i=0;i<8;i++) {
587
        pixels[0] = cm[block[0]];
588
        pixels[1] = cm[block[1]];
589
        pixels[2] = cm[block[2]];
590
        pixels[3] = cm[block[3]];
591
        pixels[4] = cm[block[4]];
592
        pixels[5] = cm[block[5]];
593
        pixels[6] = cm[block[6]];
594
        pixels[7] = cm[block[7]];
595

    
596
        pixels += line_size;
597
        block += 8;
598
    }
599
}
600

    
601
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
602
                                 int line_size)
603
{
604
    int i;
605
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
606

    
607
    /* read the pixels */
608
    for(i=0;i<4;i++) {
609
        pixels[0] = cm[block[0]];
610
        pixels[1] = cm[block[1]];
611
        pixels[2] = cm[block[2]];
612
        pixels[3] = cm[block[3]];
613

    
614
        pixels += line_size;
615
        block += 8;
616
    }
617
}
618

    
619
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
620
                                 int line_size)
621
{
622
    int i;
623
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
624

    
625
    /* read the pixels */
626
    for(i=0;i<2;i++) {
627
        pixels[0] = cm[block[0]];
628
        pixels[1] = cm[block[1]];
629

    
630
        pixels += line_size;
631
        block += 8;
632
    }
633
}
634

    
635
/* Store an 8x8 block of signed coefficients as pixels: value + 128,
 * clamped to 0..255. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[8 * row + col] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        pixels += line_size;
    }
}
655

    
656
/* Store an 8x8 coefficient block as pixels without clamping
 * (values are simply truncated to 8 bits). */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block += 8;
    }
}
676

    
677
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
678
                          int line_size)
679
{
680
    int i;
681
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
682

    
683
    /* read the pixels */
684
    for(i=0;i<8;i++) {
685
        pixels[0] = cm[pixels[0] + block[0]];
686
        pixels[1] = cm[pixels[1] + block[1]];
687
        pixels[2] = cm[pixels[2] + block[2]];
688
        pixels[3] = cm[pixels[3] + block[3]];
689
        pixels[4] = cm[pixels[4] + block[4]];
690
        pixels[5] = cm[pixels[5] + block[5]];
691
        pixels[6] = cm[pixels[6] + block[6]];
692
        pixels[7] = cm[pixels[7] + block[7]];
693
        pixels += line_size;
694
        block += 8;
695
    }
696
}
697

    
698
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
699
                          int line_size)
700
{
701
    int i;
702
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
703

    
704
    /* read the pixels */
705
    for(i=0;i<4;i++) {
706
        pixels[0] = cm[pixels[0] + block[0]];
707
        pixels[1] = cm[pixels[1] + block[1]];
708
        pixels[2] = cm[pixels[2] + block[2]];
709
        pixels[3] = cm[pixels[3] + block[3]];
710
        pixels += line_size;
711
        block += 8;
712
    }
713
}
714

    
715
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
716
                          int line_size)
717
{
718
    int i;
719
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
720

    
721
    /* read the pixels */
722
    for(i=0;i<2;i++) {
723
        pixels[0] = cm[pixels[0] + block[0]];
724
        pixels[1] = cm[pixels[1] + block[1]];
725
        pixels += line_size;
726
        block += 8;
727
    }
728
}
729

    
730
/* Add an 8x8 residual block to pixels without clamping. */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 8;
    }
}
746

    
747
/* Add a 4x4 residual block to pixels without clamping.
 * Note: here the coefficient rows are packed 4 wide (block advances by 4). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block += 4;
    }
}
759

    
760
/* Sum of absolute values over a full 8x8 coefficient block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *p = block, *end = block + 64;
    int total = 0;

    while (p < end)
        total += FFABS(*p++);
    return total;
}
767

    
768
/* Fill h rows of 16 bytes each with the given value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
777

    
778
/* Fill h rows of 8 bytes each with the given value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
787

    
788
/* Upscale an 8x8 block of samples to 16x16 by pixel doubling: each source
 * sample is written as a 16-bit value (byte duplicated via *0x0101) into two
 * adjacent destination rows.  dst must be 16-bit aligned, linesize is in
 * bytes.
 * Fix: the previous version assigned uint8_t* to uint16_t* without a cast —
 * an implicit incompatible-pointer conversion (constraint violation); add
 * the explicit casts. */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;              /* even destination row */
    uint16_t *dst2 = (uint16_t *)(dst + linesize);  /* odd destination row  */

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            /* 0x0101 duplicates the byte into both halves of the uint16 */
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;  /* uint16_t elements: advances 2*linesize bytes */
        dst2 += linesize;
    }
}
803

    
804
#if 0
805

806
#define PIXOP2(OPNAME, OP) \
807
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
808
{\
809
    int i;\
810
    for(i=0; i<h; i++){\
811
        OP(*((uint64_t*)block), AV_RN64(pixels));\
812
        pixels+=line_size;\
813
        block +=line_size;\
814
    }\
815
}\
816
\
817
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
818
{\
819
    int i;\
820
    for(i=0; i<h; i++){\
821
        const uint64_t a= AV_RN64(pixels  );\
822
        const uint64_t b= AV_RN64(pixels+1);\
823
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
824
        pixels+=line_size;\
825
        block +=line_size;\
826
    }\
827
}\
828
\
829
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
830
{\
831
    int i;\
832
    for(i=0; i<h; i++){\
833
        const uint64_t a= AV_RN64(pixels  );\
834
        const uint64_t b= AV_RN64(pixels+1);\
835
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
836
        pixels+=line_size;\
837
        block +=line_size;\
838
    }\
839
}\
840
\
841
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
842
{\
843
    int i;\
844
    for(i=0; i<h; i++){\
845
        const uint64_t a= AV_RN64(pixels          );\
846
        const uint64_t b= AV_RN64(pixels+line_size);\
847
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
848
        pixels+=line_size;\
849
        block +=line_size;\
850
    }\
851
}\
852
\
853
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
854
{\
855
    int i;\
856
    for(i=0; i<h; i++){\
857
        const uint64_t a= AV_RN64(pixels          );\
858
        const uint64_t b= AV_RN64(pixels+line_size);\
859
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
860
        pixels+=line_size;\
861
        block +=line_size;\
862
    }\
863
}\
864
\
865
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
866
{\
867
        int i;\
868
        const uint64_t a= AV_RN64(pixels  );\
869
        const uint64_t b= AV_RN64(pixels+1);\
870
        uint64_t l0=  (a&0x0303030303030303ULL)\
871
                    + (b&0x0303030303030303ULL)\
872
                    + 0x0202020202020202ULL;\
873
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
874
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
875
        uint64_t l1,h1;\
876
\
877
        pixels+=line_size;\
878
        for(i=0; i<h; i+=2){\
879
            uint64_t a= AV_RN64(pixels  );\
880
            uint64_t b= AV_RN64(pixels+1);\
881
            l1=  (a&0x0303030303030303ULL)\
882
               + (b&0x0303030303030303ULL);\
883
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
884
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
885
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
886
            pixels+=line_size;\
887
            block +=line_size;\
888
            a= AV_RN64(pixels  );\
889
            b= AV_RN64(pixels+1);\
890
            l0=  (a&0x0303030303030303ULL)\
891
               + (b&0x0303030303030303ULL)\
892
               + 0x0202020202020202ULL;\
893
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
894
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
895
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
896
            pixels+=line_size;\
897
            block +=line_size;\
898
        }\
899
}\
900
\
901
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
902
{\
903
        int i;\
904
        const uint64_t a= AV_RN64(pixels  );\
905
        const uint64_t b= AV_RN64(pixels+1);\
906
        uint64_t l0=  (a&0x0303030303030303ULL)\
907
                    + (b&0x0303030303030303ULL)\
908
                    + 0x0101010101010101ULL;\
909
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
910
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
911
        uint64_t l1,h1;\
912
\
913
        pixels+=line_size;\
914
        for(i=0; i<h; i+=2){\
915
            uint64_t a= AV_RN64(pixels  );\
916
            uint64_t b= AV_RN64(pixels+1);\
917
            l1=  (a&0x0303030303030303ULL)\
918
               + (b&0x0303030303030303ULL);\
919
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
920
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
921
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
922
            pixels+=line_size;\
923
            block +=line_size;\
924
            a= AV_RN64(pixels  );\
925
            b= AV_RN64(pixels+1);\
926
            l0=  (a&0x0303030303030303ULL)\
927
               + (b&0x0303030303030303ULL)\
928
               + 0x0101010101010101ULL;\
929
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
930
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
931
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
932
            pixels+=line_size;\
933
            block +=line_size;\
934
        }\
935
}\
936
\
937
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
938
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
939
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
940
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
941
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
942
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
943
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
944

945
/* 64-bit packed-byte rounding average: (a|b) - (((a^b) & ~LSB) >> 1),
 * i.e. per-byte (a+b+1)>>1 without cross-byte carries. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/**
 * Generates the whole family of pixel-copy / half-pel interpolation
 * primitives for one store operation OP (instantiated below with "put"
 * and "avg").  The _l2 helpers average two sources per byte using the
 * rnd_avg32/no_rnd_avg32 packed tricks; the _l4/_xy2 helpers average
 * four values per byte via split low-bit (l0/l1) and high-bit (h0/h1)
 * accumulators so no cross-byte carry can occur.  "no_rnd" variants use
 * a smaller rounding constant (0x01.. instead of 0x02..).
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

1314
#define op_avg(a, b) a = rnd_avg32(a, b)
1315
#endif
1316
#define op_put(a, b) a = b
1317

    
1318
PIXOP2(avg, op_avg)
1319
PIXOP2(put, op_put)
1320
#undef op_avg
1321
#undef op_put
1322

    
1323
#define avg2(a,b) ((a+b+1)>>1)
1324
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1325

    
1326
/* Thin wrapper: average two 16-pixel-wide sources (no rounding) using a
 * single common stride for dst and both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

/* Thin wrapper: average two 8-pixel-wide sources (no rounding) using a
 * single common stride for dst and both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block at a single
 * 1/16-pel fractional position.
 * @param x16,y16  fractional offsets in 1/16 pel (0..16); the four corner
 *                 weights A..D sum to 256
 * @param rounder  constant added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

/**
 * Global motion compensation with an affine per-pixel vector field.
 * Writes an 8-pixel-wide, h-row block.  The motion vector (vx,vy) starts
 * at (ox,oy), advances by (dxx,dyx) per column and (dxy,dyy) per row;
 * positions are 16.16 fixed point, with shift giving the sub-pel precision
 * (s = 1<<shift) and r the rounding constant for the >>(shift*2).
 * Source reads outside [0,width)x[0,height) are clamped to the edge; on an
 * out-of-range axis the interpolation degenerates to 1-D (or plain copy).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y out of range: clamp row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x out of range: clamp column, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: nearest clamped sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

/* Integer-position (no interpolation) copy, dispatched by block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

/* Horizontal third-pel interpolation, weights (2/3, 1/3); 683 ~= 2048/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Horizontal third-pel interpolation, weights (1/3, 2/3); 683 ~= 2048/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* Vertical third-pel interpolation, weights (2/3, 1/3); 683 ~= 2048/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel interpolation at (1/3,1/3): corner weights 4,3,3,2 out of
 * 12; 2731 ~= 32768/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel interpolation at (1/3,2/3): corner weights 3,2,4,3 out of
 * 12; 2731 ~= 32768/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Vertical third-pel interpolation, weights (1/3, 2/3); 683 ~= 2048/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel interpolation at (2/3,1/3): corner weights 3,4,2,3 out of
 * 12; 2731 ~= 32768/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel interpolation at (2/3,2/3): corner weights 2,3,3,4 out of
 * 12; 2731 ~= 32768/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* Integer-position averaging copy, dispatched by block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

/* Horizontal third-pel (2/3,1/3) interpolation, then rounded average with
 * the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Horizontal third-pel (1/3,2/3) interpolation, then rounded average with
 * the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Vertical third-pel (2/3,1/3) interpolation, then rounded average with
 * the existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel (1/3,1/3) interpolation (weights 4,3,3,2/12), then rounded
 * average with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel (1/3,2/3) interpolation (weights 3,2,4,3/12), then rounded
 * average with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Vertical third-pel (1/3,2/3) interpolation, then rounded average with
 * the existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel (2/3,1/3) interpolation (weights 3,4,2,3/12), then rounded
 * average with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* 2-D third-pel (2/3,2/3) interpolation (weights 2,3,3,4/12), then rounded
 * average with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* Dead code: width-specialized tpel wrapper generator (disabled). */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

/**
 * H.264 chroma motion compensation: 2-D bilinear interpolation with
 * eighth-pel fractional offsets (x, y in 0..7), generating the 2-, 4-
 * and 8-pixel-wide variants.  The four taps A..D = (8-x)(8-y), x(8-y),
 * (8-x)y, xy sum to 64; OP performs the normalization/store (and the
 * averaging for the avg_ variants).  When D == 0 (x == 0 or y == 0)
 * the 2-D filter degenerates to a 1-D two-tap filter along one axis;
 * "step" selects that axis (stride when only y != 0, else 1).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* 1-D case: fold the two remaining taps into E along axis "step" */ \
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1730

    
1731
/* Rounding/store ops for H264_CHROMA_MC: b holds a 6-bit fixed-point sum
 * (taps total 64), so (b + 32) >> 6 is a rounded divide.  op_avg then
 * averages (round-up) with the current destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate the put_/avg_ h264_chroma_mc{2,4,8}_c functions. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1738

    
1739
/**
 * VC-1 8-pixel-wide chroma MC, "no rounding" variant: same bilinear taps
 * A..D as the H.264 chroma MC (they sum to 64), but the bias before the
 * >>6 is 32-4 = 28 instead of 32, i.e. rounding is biased downward.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            /* bilinear tap sum with the VC-1 down-biased rounding constant */
            dst[col] = (A*src[col] + B*src[col+1] +
                        C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1762

    
1763
/**
 * VC-1 8-pixel-wide chroma MC, "no rounding" variant, averaged with the
 * existing destination via avg2.  Same filter as the put_ variant: taps
 * A..D sum to 64, bias 32-4 before the >>6.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pix = (A*src[col] + B*src[col+1] +
                             C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
            dst[col] = avg2(dst[col], pix);
        }
        dst += stride;
        src += stride;
    }
}
1786

    
1787
/**
 * MPEG-4 quarter-pel motion compensation.
 * For 8x8 and 16x16 blocks this instantiates the horizontal and vertical
 * lowpass half-pel filters (taps 20, -6, 3, -1; border taps reuse edge
 * samples instead of reading out of bounds) plus one OPNAME##qpelN_mcXY_c
 * function for every quarter-pel position (X, Y in 0..3), built by
 * combining full/half-pel planes with the pixelsN_l2/_l4 averaging
 * helpers.  OP performs the clip (via crop table cm) and store/average;
 * RND selects the rounded or no-rounding helper set for intermediates.
 * The ff_*_old_c variants keep an older, bit-different interpolation
 * (presumably for compatibility with old encoders — they average four
 * planes with pixelsN_l4 instead of the two-plane scheme).
 * NOTE(review): macro parameter r appears unused in this C version.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* referenced by OP */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2269

    
2270
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2271
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2272
#define op_put(a, b) a = cm[((b) + 16)>>5]
2273
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2274

    
2275
QPEL_MC(0, put_       , _       , op_put)
2276
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2277
QPEL_MC(0, avg_       , _       , op_avg)
2278
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2279
#undef op_avg
2280
#undef op_avg_no_rnd
2281
#undef op_put
2282
#undef op_put_no_rnd
2283

    
2284
#if 1
/*
 * H264_LOWPASS() instantiates the H.264 six-tap (1,-5,20,20,-5,1) luma
 * half-pel interpolation filters for 2x2, 4x4, 8x8 and 16x16 blocks:
 *   *_h_lowpass  - horizontal filtering,
 *   *_v_lowpass  - vertical filtering,
 *   *_hv_lowpass - horizontal pass into a 16-bit tmp buffer, then a
 *                  vertical pass over tmp (OP2 handles the wider range).
 * OP/OP2 perform the final rounding/clipping and either store or average.
 * The 16x16 versions are composed from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

2548

    
2549
/*
 * H264_MC() instantiates the sixteen quarter-pel motion-compensation
 * positions (mcXY, X/Y = horizontal/vertical quarter-pel phase 0..3) for a
 * SIZE x SIZE block, on top of the *_lowpass filters created by
 * H264_LOWPASS().  Quarter-pel positions are produced by averaging (via
 * *pixels*_l2) an integer/half-pel source with a filtered half-pel plane.
 * copy_block##SIZE pads SIZE+5 rows into "full" so the vertical filter can
 * read the two rows above and three below the block.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2685

    
2686
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2687
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2688
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2689
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2690
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2691

    
2692
H264_LOWPASS(put_       , op_put, op2_put)
2693
H264_LOWPASS(avg_       , op_avg, op2_avg)
2694
H264_MC(put_, 2)
2695
H264_MC(put_, 4)
2696
H264_MC(put_, 8)
2697
H264_MC(put_, 16)
2698
H264_MC(avg_, 4)
2699
H264_MC(avg_, 8)
2700
H264_MC(avg_, 16)
2701

    
2702
#undef op_avg
2703
#undef op_put
2704
#undef op2_avg
2705
#undef op2_put
2706
#endif
2707

    
2708
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2709
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2710
#define H264_WEIGHT(W,H) \
2711
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2712
    int y; \
2713
    offset <<= log2_denom; \
2714
    if(log2_denom) offset += 1<<(log2_denom-1); \
2715
    for(y=0; y<H; y++, block += stride){ \
2716
        op_scale1(0); \
2717
        op_scale1(1); \
2718
        if(W==2) continue; \
2719
        op_scale1(2); \
2720
        op_scale1(3); \
2721
        if(W==4) continue; \
2722
        op_scale1(4); \
2723
        op_scale1(5); \
2724
        op_scale1(6); \
2725
        op_scale1(7); \
2726
        if(W==8) continue; \
2727
        op_scale1(8); \
2728
        op_scale1(9); \
2729
        op_scale1(10); \
2730
        op_scale1(11); \
2731
        op_scale1(12); \
2732
        op_scale1(13); \
2733
        op_scale1(14); \
2734
        op_scale1(15); \
2735
    } \
2736
} \
2737
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2738
    int y; \
2739
    offset = ((offset + 1) | 1) << log2_denom; \
2740
    for(y=0; y<H; y++, dst += stride, src += stride){ \
2741
        op_scale2(0); \
2742
        op_scale2(1); \
2743
        if(W==2) continue; \
2744
        op_scale2(2); \
2745
        op_scale2(3); \
2746
        if(W==4) continue; \
2747
        op_scale2(4); \
2748
        op_scale2(5); \
2749
        op_scale2(6); \
2750
        op_scale2(7); \
2751
        if(W==8) continue; \
2752
        op_scale2(8); \
2753
        op_scale2(9); \
2754
        op_scale2(10); \
2755
        op_scale2(11); \
2756
        op_scale2(12); \
2757
        op_scale2(13); \
2758
        op_scale2(14); \
2759
        op_scale2(15); \
2760
    } \
2761
}
2762

    
2763
H264_WEIGHT(16,16)
2764
H264_WEIGHT(16,8)
2765
H264_WEIGHT(8,16)
2766
H264_WEIGHT(8,8)
2767
H264_WEIGHT(8,4)
2768
H264_WEIGHT(4,8)
2769
H264_WEIGHT(4,4)
2770
H264_WEIGHT(4,2)
2771
H264_WEIGHT(2,4)
2772
H264_WEIGHT(2,2)
2773

    
2774
#undef op_scale1
2775
#undef op_scale2
2776
#undef H264_WEIGHT
2777

    
2778
/**
 * WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding,
 * applied to 8 output pixels per row over h rows; results are clipped
 * through cm (ff_cropTbl). Reads src[-1]..src[8] on each row.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
2795

    
2796
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* CAVS full-pel (0,0) MC positions: plain copy/average of 8x8 / 16x16 blocks. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2813

    
2814
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* VC-1 full-pel (0,0) MC: plain 8x8 copy/average; 'rnd' is unused here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */
2836

    
2837
#if CONFIG_RV40_DECODER
/* RV40 (3,3) sub-pel positions map onto the plain xy2 (center) averagers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2853

    
2854
/**
 * WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel as the
 * horizontal variant, applied down each of w columns for 8 output rows.
 * Reads rows src[-srcStride]..src[9*srcStride].
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
2882

    
2883
/* WMV2 mspel MC positions built from the wmv2_mspel8_*_lowpass filters.
 * Naming follows mcXY: X = horizontal phase, Y = vertical phase. */

/* (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* (1,0): average of source and horizontally filtered half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3,0): like (1,0) but averaged with src shifted one pixel right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* (1,2): average of the V-filtered plane and the HV-filtered plane.
 * halfH holds 11 filtered rows (one above, two below) so the vertical
 * pass over it has the context it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3,2): as (1,2) with the V-filter applied one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (2,2): horizontal filter then vertical filter (center position). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2930

    
2931
/**
 * H.263 deblocking filter for one horizontal (inter-row) block edge.
 * Filters the 2 pixels on each side of the edge for 8 columns; filter
 * strength is looked up from the quantizer.  src points at the first
 * row below the edge.  Compiled out unless an H.263 codec is enabled.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear "dead zone" response: small d passes through,
         * larger d is ramped back down to 0 beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clamp to 0..255: bit 8 set means out of range,
         * ~(p>>31) is then 255 for overflow and 0 for underflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pixels */
        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
2967

    
2968
/**
 * H.263 deblocking filter for one vertical (inter-column) block edge.
 * Transposed twin of h263_v_loop_filter_c: filters 2 pixels on each
 * side of the edge for 8 rows.  src points at the first column right
 * of the edge.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* same piecewise-linear response as the vertical filter */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clamp to 0..255 (see h263_v_loop_filter_c) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
3004

    
3005
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * The vertical pass goes into a temporary (top and bottom rows are only
 * scaled, not filtered), the horizontal pass writes back into src with
 * rounding (edge columns are likewise only scaled back down).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical [1 2 1] pass; border rows carry 4*src so the horizontal
     * pass can normalize every entry the same way */
    for(col=0; col<8; col++){
        temp[col]      = 4*src[col];
        temp[56 + col] = 4*src[7*stride + col];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            const int s = row*stride + col;
            temp[8*row + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal [1 2 1] pass with rounding, back into src */
    for(row=0; row<8; row++){
        src[row*stride    ] = (temp[8*row    ] + 2) >> 2;
        src[row*stride + 7] = (temp[8*row + 7] + 2) >> 2;
        for(col=1; col<7; col++){
            const int t = 8*row + col;
            src[row*stride + col] = (temp[t-1] + 2*temp[t] + temp[t+1] + 8) >> 4;
        }
    }
}
3031

    
3032
/**
 * H.264 luma deblocking of one 16-sample edge, tc0-controlled (normal)
 * path.  The edge is processed as 4 groups of 4 lines; a negative
 * tc0[i] skips that group.  xstride steps across the edge, ystride
 * along it, so the same code serves both edge orientations.
 */
static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the step looks like a blocking artifact,
             * not a real edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* p1/q1 correction when the inner samples are smooth;
                 * each applied correction widens the clip range by 1 */
                if( FFABS( p2 - p0 ) < beta ) {
                    if(tc0[i])
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    if(tc0[i])
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
3074
/* Vertical filtering of a horizontal luma edge: cross-edge step is the
 * row stride, along-edge step is 1 pixel. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
3078
/* Horizontal filtering of a vertical luma edge: cross-edge step is 1
 * pixel, along-edge step is the row stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
3082

    
3083
/**
 * H.264 luma deblocking of one 16-sample edge, strong (intra / bS=4)
 * path: no tc clipping; instead a stronger 4/5-tap smoothing is chosen
 * per line based on the local gradients.
 */
static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* very small step across the edge: strong 3-pixel smoothing */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* weaker fallback: only p0/q0 are touched */
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
3131
/* Strong-filter variant for a horizontal luma edge (see non-intra twin). */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
3135
/* Strong-filter variant for a vertical luma edge (see non-intra twin). */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
3139

    
3140
/**
 * H.264 chroma deblocking of one 8-sample edge, tc-controlled path.
 * 4 groups of 2 lines; tc <= 0 skips a group.  Only p0/q0 are
 * modified (chroma never filters p1/q1).
 */
static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
3168
/* Chroma filtering of a horizontal edge. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
3172
/* Chroma filtering of a vertical edge. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3176

    
3177
/**
 * H.264 chroma deblocking of one 8-sample edge, strong (intra) path:
 * unconditional 3-tap replacement of p0/q0 wherever the edge passes
 * the alpha/beta activity checks.
 */
static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
3196
/* Strong chroma filtering of a horizontal edge. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
3200
/* Strong chroma filtering of a vertical edge. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
3204

    
3205
/**
 * Sum of absolute differences of a 16-pixel-wide block, h rows.
 * The unused void* keeps the signature compatible with the me_cmp_func
 * table.  The hand-unrolled original is expressed as a plain inner
 * loop; the per-pixel arithmetic is unchanged.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3232

    
3233
/* SAD of a 16-wide block against the half-pel horizontally interpolated
 * reference (avg2 of each pixel and its right neighbour; reads pix2[16]). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3260

    
3261
/* SAD of a 16-wide block against the half-pel vertically interpolated
 * reference (avg2 of each pixel and the one below; reads h+1 rows). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* next reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3290

    
3291
/* SAD of a 16-wide block against the half-pel diagonally interpolated
 * reference (avg4 of the 2x2 neighbourhood; reads pix2[16] and h+1 rows). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* next reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3320

    
3321
/**
 * Sum of absolute differences of an 8-pixel-wide block, h rows.
 * Same contract as pix_abs16_c but half the width; the unrolled body
 * is written as a simple inner loop, arithmetic unchanged.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3340

    
3341
/* 8-wide SAD against the half-pel horizontally interpolated reference
 * (reads pix2[8]). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3360

    
3361
/* 8-wide SAD against the half-pel vertically interpolated reference
 * (reads h+1 rows). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* next reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3382

    
3383
/* 8-wide SAD against the half-pel diagonally interpolated reference
 * (reads pix2[8] and h+1 rows). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* next reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3404

    
3405
/**
 * Noise-preserving SSE, 16-wide: SSE plus a weighted penalty for the
 * difference in local 2x2 gradient energy between the two blocks
 * (so added/removed noise costs, matched noise does not).  v may be a
 * MpegEncContext (nsse_weight from its AVCodecContext) or NULL, in
 * which case a default weight of 8 is used.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
3430

    
3431
/* 8-wide variant of nsse16_c; see that function for the metric. */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
3456

    
3457
/**
 * Rate-distortion probe for the trellis quantizer: returns the weighted
 * squared error that would remain if `basis` scaled by `scale` were
 * added to the residual `rem`.  Values are in fixed point
 * (BASIS_SHIFT/RECON_SHIFT); the >>4 and >>2 keep the accumulator in
 * range.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
3471

    
3472
/* Accumulate `basis` scaled by `scale` into the residual, with the same
 * fixed-point rounding as try_8x8basis_c. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3479

    
3480
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
    /* copy out the touched coefficients and zero their old slots, so
     * that in-place permutation cannot overwrite a not-yet-moved value */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* write each coefficient back at its permuted position */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
3508

    
3509
/* Comparator for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3512

    
3513
/**
 * Fill the 6-entry comparison-function table `cmp` (one entry per block
 * size class) with the DSPContext implementations selected by `type`
 * (an FF_CMP_* id in the low byte).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* unknown id: entries stay NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3572

    
3573
/* Zero one 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
3577

    
3578
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zero all six DCT blocks of one macroblock at once.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3585

    
3586
/**
 * dst[i] += src[i] for 0 <= i < w (modulo-256 per byte).
 * Adds a whole machine word at a time with a SWAR trick: the carry out
 * of each byte lane is suppressed via the pb_7f/pb_80 masks, then the
 * tail is handled per byte.
 * NOTE(review): the word loads/stores assume unaligned long access is
 * acceptable on this target — matches the file's other SWAR helpers.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    /* cast sizeof to long: otherwise the subtraction is done as size_t
     * and a w smaller than one word wraps to a huge unsigned bound,
     * sending the word loop out of bounds */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
3596

    
3597
/**
 * dst[i] = src1[i] + src2[i] for 0 <= i < w (modulo-256 per byte),
 * word-at-a-time with the same carry-suppressing SWAR trick as
 * add_bytes_c, byte tail afterwards.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    /* signed bound (see add_bytes_c): avoids size_t wrap-around for
     * w < sizeof(long), which would run the word loop out of bounds */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3607

    
3608
/**
 * dst[i] = src1[i] - src2[i] for 0 <= i < w (modulo-256 per byte).
 * On targets without fast unaligned access a plain byte loop is used
 * when src2 is misaligned; otherwise a SWAR word loop subtracts all
 * byte lanes at once with borrow suppression, then the tail is done
 * per byte.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* signed bound (see add_bytes_c): avoids size_t wrap-around for
     * w < sizeof(long), which would run the word loop out of bounds */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
3632

    
3633
/**
 * HuffYUV median-prediction decode: dst[i] = median(left, top,
 * left+top-topleft) + diff[i], scanning one row.  src1 is the previous
 * (top) row; *left / *left_top carry the running context across calls
 * and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
3649

    
3650
/**
 * HuffYUV median-prediction encode: dst[i] = src2[i] - median(left,
 * top, left+top-topleft); inverse of add_hfyu_median_prediction_c.
 * src1 is the top row, src2 the current row; *left / *left_top are the
 * running context, updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3667

    
3668
/**
 * HuffYUV left-prediction decode: running prefix sum of src into dst
 * (each dst byte is the low 8 bits of the accumulator), starting from
 * `acc`.  Returns the final accumulator so the caller can continue on
 * the next row.  Unrolled by two like the original / SIMD versions.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i = 0;

    while (i < w - 1) {
        acc += src[i];
        dst[i++] = acc;
        acc += src[i];
        dst[i++] = acc;
    }

    /* odd tail: at most one element left */
    if (i < w) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
3686

    
3687
/* Byte offsets of the BGRA components inside one 32-bit pixel,
 * endian-dependent so the channel accumulators track the right bytes. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Left-prediction decode for 32-bit BGRA pixels: per-channel running
 * sums of src into dst, seeded from and written back to the four
 * channel accumulators (values are truncated to 8 bits on store).
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
3727

    
3728
/* Sum/difference butterfly into two outputs; building block of the
 * 8x8 Hadamard transforms below. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly on two lvalues. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: the last butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3742

    
3743
/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference src - dst.  Rows are transformed first, then columns,
 * with the final column stage folded into BUTTERFLYA.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal (row) transform of the difference block */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (column) transform; last stage merged into the abs sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3794

    
3795
/**
 * Intra SATD of one 8x8 block: same Hadamard butterfly network as
 * hadamard8_diff8x8_c but applied to the source pixels directly (no
 * reference block), with the DC term subtracted at the end so a flat
 * block scores 0.
 * @param dummy unused second block pointer (kept for the cmp-function signature)
 * @param h must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass over the raw source rows */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass over the columns of temp[] */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        /* final stage folded into the absolute-value accumulation */
        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] is the remaining DC component; removing it makes
       the metric independent of the block mean */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
    return sum;
}
3842

    
3843
/**
 * DCT-domain SAD of one 8x8 block: takes the src1 - src2 difference,
 * runs the encoder's forward DCT on it, and returns the sum of the
 * absolute values of the coefficients.
 * @param h must be 8
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);  /* fdct implementations may require 16-byte alignment */

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3853

    
3854
#if CONFIG_GPL
/*
 * One 8-point 1-D integer transform pass, parameterized through the
 * SRC()/DST() macros (re)defined at each use site.  The butterfly
 * structure (mirrored-pair sums s07..s34, differences d07..d34 combined
 * with >>1 / >>2 scalings) is the 8x8 transform used by
 * dct264_sad8x8_c below — apparently the H.264-style 8x8 forward
 * transform, which is why this block is GPL-gated.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3881

    
3882
/**
 * SAD in the H.264-style 8x8 transform domain: forward-transforms the
 * src1 - src2 residual with DCT8_1D (row pass, then column pass) and
 * accumulates the absolute coefficient values during the column pass
 * via the redefined DST() macro.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    /* dct[0] addresses the contiguous 64-element coefficient array */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row of dct[][] in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: the transformed values are not stored, only their
   absolute values are summed */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3906

    
3907
/**
 * Maximum absolute forward-DCT coefficient of the src1 - src2 residual
 * of one 8x8 block.
 * @param h must be 8
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    /* scan all 64 coefficients for the largest magnitude */
    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3922

    
3923
/**
 * Quantization-error metric for one 8x8 block: the src1 - src2 residual
 * is saved, then run through the encoder's fast DCT quantizer (which
 * presumably includes the forward DCT — see fast_dct_quantize),
 * dequantized and inverse-transformed, and the summed squared error
 * against the saved residual is returned.
 * @param h must be 8
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;  /* second half of the buffer keeps the original residual */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* force inter quantization/dequantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    /* squared error between reconstruction and original residual */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3945

    
3946
/**
 * Rate-distortion metric for one 8x8 block.
 * The src1 - src2 residual is quantized; the rate is estimated from the
 * encoder's AC VLC length tables, the block is then dequantized and
 * reconstructed with the IDCT onto a copy of src2, and the distortion is
 * the 8x8 SSE of the reconstruction against src1.
 * @param h must be 8
 * @return distortion + bits weighted by roughly qscale^2 * 109/128
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies so the reconstruction below does not modify
       the caller's pixels */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    /* --- rate estimation ------------------------------------------- */
    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC coded separately for intra blocks */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* table index offset: levels -64..63 map to 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;  /* out-of-table level: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        /* the last coefficient uses the "last" VLC length table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* --- reconstruction + distortion ------------------------------- */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);  /* sse[1] = 8x8 SSE */

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
4021

    
4022
/**
 * Bit-count metric for one 8x8 block: quantizes the DCT of the
 * src1 - src2 residual and estimates the number of bits needed to code
 * the coefficients using the encoder's AC VLC length tables (escape
 * codes cost esc_length bits).  No distortion term — rate only.
 * @param h must be 8
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC coded separately for intra blocks */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* table index offset: levels -64..63 map to 0..127 */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        /* the last coefficient uses the "last" VLC length table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
4080

    
4081
/**
 * Intra vertical SAD: sums |p(x,y) - p(x,y-1)| over the h-1 vertically
 * adjacent row pairs of a single `size`-pixel-wide block (inner loop
 * unrolled by 4).  Expands to vsad_intra8_c and vsad_intra16_c.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
4098

    
4099
/**
 * Vertical SAD of the residual between two 16-pixel-wide blocks:
 * sums |(s1-s2)(x,y) - (s1-s2)(x,y-1)| over the h-1 adjacent row pairs.
 * The context pointer is unused.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d < 0 ? -d : d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
4113

    
4114
#define SQ(a) ((a)*(a))
/**
 * Intra vertical SSE: sums (p(x,y) - p(x,y-1))^2 over the h-1 vertically
 * adjacent row pairs of a single `size`-pixel-wide block (inner loop
 * unrolled by 4).  Expands to vsse_intra8_c and vsse_intra16_c.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
4132

    
4133
/**
 * Vertical SSE of the residual between two 16-pixel-wide blocks:
 * sums ((s1-s2)(x,y) - (s1-s2)(x,y-1))^2 over the h-1 adjacent row
 * pairs.  The context pointer is unused.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
4147

    
4148
/**
 * Sum of squared differences between an int8 vector and an int16 vector
 * of `size` elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;
    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
4156

    
4157
/* Instantiate 16x16 variants of the 8x8 metrics above via the
 * WRAPPER8_16_SQ macro (defined earlier in this file; presumably it
 * applies the 8x8 kernel to each 8x8 quadrant and sums the results). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4167

    
4168
/** Element-wise in-place multiply: dst[k] *= src[k] for k in [0,len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = dst[k] * src[k];
}
4173

    
4174
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4175
    int i;
4176
    src1 += len-1;
4177
    for(i=0; i<len; i++)
4178
        dst[i] = src0[i] * src1[-i];
4179
}
4180

    
4181
/** Fused multiply-add over vectors: dst[k] = src0[k]*src1[k] + src2[k]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;
    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
4186

    
4187
/**
 * Overlap-add windowing of two half-length signals into a 2*len output:
 * for a in [0,len) and its mirror b = 2*len-1-a,
 *   dst[a] = src0[a]*win[b] - src1[len-1-a]*win[a] + add_bias
 *   dst[b] = src0[a]*win[a] + src1[len-1-a]*win[b] + add_bias
 * Identical arithmetic (and float rounding) to the offset-pointer form.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int a;
    for (a = 0; a < len; a++) {
        const int   b  = 2*len - 1 - a;
        const float s0 = src0[a];
        const float s1 = src1[len - 1 - a];
        const float w0 = win[a];
        const float w1 = win[b];
        dst[a] = s0*w1 - s1*w0 + add_bias;
        dst[b] = s0*w0 + s1*w1 + add_bias;
    }
}
4201

    
4202
/** Scale a vector by a scalar: dst[k] = src[k] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
4209

    
4210
/**
 * Multiply src by per-pair scale vectors and a scalar: sv points to one
 * 2-element vector per output pair; dst[k+j] = src[k+j]*sv[k/2][j]*mul.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 2) {
        const float *pair = *sv++;
        dst[k  ] = src[k  ] * pair[0] * mul;
        dst[k+1] = src[k+1] * pair[1] * mul;
    }
}
4219

    
4220
/**
 * Multiply src by per-quad scale vectors and a scalar: sv points to one
 * 4-element vector per output quad; dst[k+j] = src[k+j]*sv[k/4][j]*mul.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, j;
    for (i = 0; i < len; i += 4) {
        const float *quad = *sv++;
        for (j = 0; j < 4; j++)
            dst[i+j] = src[i+j] * quad[j] * mul;
    }
}
4231

    
4232
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
4233
                               int len)
4234
{
4235
    int i;
4236
    for (i = 0; i < len; i += 2, sv++) {
4237
        dst[i  ] = sv[0][0] * mul;
4238
        dst[i+1] = sv[0][1] * mul;