/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* lpc.c */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* binkidct.c */
void ff_bink_idct_c    (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
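
/*
 * Illustrative sketch (an assumption, not code from this file): the two masks
 * above support SWAR (SIMD-within-a-register) byte arithmetic on ordinary
 * integer registers. For example, bytes packed in a word can be summed
 * without carries spilling into neighbouring lanes:
 */
static inline unsigned long bytewise_add_sketch(unsigned long a, unsigned long b)
{
    unsigned long l = (a & pb_7f) + (b & pb_7f); /* per-byte sum of the low 7 bits */
    return l ^ ((a ^ b) & pb_80);                /* restore the top bit of each byte */
}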

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
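
/*
 * Usage sketch (assumed, not part of the original file): ff_inverse turns a
 * division by a small constant into a multiply and a shift, which is what
 * FFmpeg's FASTDIV macro in mathops.h is built on:
 */
static inline uint32_t fastdiv_sketch(uint32_t a, uint32_t b)
{
    /* valid for 2 <= b <= 256, exact for a <= 16909558 as noted above */
    return (uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32);
}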

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j]; /* scan order composed with the IDCT's input permutation */
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end; /* highest raster position reached after i+1 coefficients */
    }
}
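
/*
 * Example (a sketch, not from the original file): with an identity
 * permutation the scan is used unchanged, e.g.
 *
 *     uint8_t identity[64];
 *     ScanTable st;
 *     for (int i = 0; i < 64; i++) identity[i] = i;
 *     ff_init_scantable(identity, &st, ff_zigzag_direct);
 *     // st.permutated[i] == ff_zigzag_direct[i]
 *     // st.raster_end[i] bounds the raster position of the last nonzero
 *     // coefficient once i+1 scan positions have been decoded
 */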

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
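
/*
 * Sketch of the table trick used above and by the sse*_c functions below
 * (illustrative, not from the original file): ff_squareTbl is assumed to be
 * filled so that sq[d] == d*d holds for d in [-255, 255] once sq points at
 * the middle of the 512-entry table:
 */
static inline uint32_t squared_diff_sketch(uint8_t a, uint8_t b)
{
    const uint32_t *sq = ff_squareTbl + 256; /* centre the table on zero */
    return sq[a - b];                        /* a - b may be negative */
}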

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
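
/*
 * Illustration (not from the original file): for a 4x2 picture with rows
 * ABCD / EFGH and w == 2, draw_edges_c produces the following padding,
 * with replicated samples surrounding the original data:
 *
 *     AAABCDDD
 *     AAABCDDD   <- w copies of the first line, corners filled
 *     AAABCDDD   <- first real line
 *     EEEFGHHH   <- last real line
 *     EEEFGHHH
 *     EEEFGHHH   <- w copies of the last line
 */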

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
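
/*
 * Assumed caller pattern (a sketch, not from this file): when a motion
 * vector points partly outside the decoded picture, the block is first
 * copied into a scratch buffer with replicated edges and read from there:
 *
 *     if (src_x < 0 || src_y < 0 ||
 *         src_x + block_w > w || src_y + block_h > h) {
 *         ff_emulated_edge_mc(edge_buf, src, linesize, block_w, block_h,
 *                             src_x, src_y, w, h);
 *         src = edge_buf;
 *     }
 */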

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
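
/*
 * Note on the trick above (illustrative, not from the original file):
 * multiplying a byte by 0x0101 replicates it into both bytes of a uint16_t,
 * so each store writes two horizontally adjacent pixels; storing through
 * dst1 and dst2 duplicates the row as well, giving a 2x2 upscale of the
 * 8x8 source block.
 */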

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
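
/*
 * Sketch of the packed-average identities behind op_avg (assumed to be what
 * rnd_avg32()/no_rnd_avg32() implement; not code from this file). For bytes
 * packed in a 32-bit word, masking with 0xFEFEFEFE before the shift keeps
 * each lane's lowest bit from leaking into its neighbour:
 *
 *     rounding:   (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1)  == (a + b + 1) >> 1 per byte
 *     truncating: (a & b) + (((a ^ b) & 0xFEFEFEFE) >> 1)  == (a + b) >> 1     per byte
 *
 * Both follow from a + b == (a ^ b) + 2*(a & b) == 2*(a | b) - (a ^ b).
 */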

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
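
/*
 * Note (illustrative, not from the original file): the four weights satisfy
 * A + B + C + D == (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16
 *               == 16*16 == 256,
 * so the >> 8 above renormalizes the bilinear interpolation on the
 * 1/16-pel grid after the rounder is added.
 */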

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
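
/*
 * Note (illustrative, not from the original file): these "tpel" routines
 * interpolate at third-pel positions. 683 ~= 2048/3 and 2731 ~= 32768/12,
 * so (683*x) >> 11 approximates x/3 and (2731*x) >> 15 approximates x/12,
 * dividing each weighted sum by its total weight (3 in the 1-D cases,
 * 12 in the 2-D cases) without an actual division.
 */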
1442

    
1443
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1444
    int i,j;
1445
    for (i=0; i < height; i++) {
1446
      for (j=0; j < width; j++) {
1447
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1448
      }
1449
      src += stride;
1450
      dst += stride;
1451
    }
1452
}
1453

    
1454
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1455
    int i,j;
1456
    for (i=0; i < height; i++) {
1457
      for (j=0; j < width; j++) {
1458
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1459
      }
1460
      src += stride;
1461
      dst += stride;
1462
    }
1463
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
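
/* The avg_ variants compute the same thirdpel prediction and then average
 * it with the existing contents of dst, rounding up:
 *     dst[j] = (dst[j] + pred + 1) >> 1; */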

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
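
/* Width-specific thirdpel wrappers built on the generic routines above;
 * currently unused, hence compiled out. */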
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
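
/* H.264 chroma MC: bilinear interpolation at eighth-pel precision.  The
 * weights A=(8-x)*(8-y), B=x*(8-y), C=(8-x)*y, D=x*y always sum to 64;
 * op_put rounds with (v + 32) >> 6.  For example, x=2, y=4 gives
 * A=24, B=8, C=24, D=8.  When D is zero (x==0 or y==0) the 2-D filter
 * degenerates to a 1-D filter along the nonzero axis, selected by step. */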

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
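
/* VC-1 chroma MC: same bilinear weights as above, but with the rounding
 * bias lowered from 32 to 32 - 4, giving the "no rounding" behaviour the
 * VC-1 bitstream expects. */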

static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
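
/* As above, but averaged with the existing prediction in dst via avg2()
 * (a rounding halving add, (a + b + 1) >> 1, defined earlier in this
 * file). */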

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
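
/* MPEG-4 quarter-pel MC.  The half-pel lowpass filter has taps
 * (-1, 3, -6, 20, 20, -6, 3, -1)/32; near the block edges the missing
 * samples are mirrored, which is why indices fold back (e.g. src[8]
 * standing in for src[9] and beyond).  Quarter positions are built by
 * averaging the filtered plane with the integer or half-pel planes
 * through pixels_l2/pixels_l4; the non-static ff_*_old_c variants keep
 * the original four-plane averaging for reference. */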

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
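
/* Rounding/clipping operators for QPEL_MC: cm points into ff_cropTbl so
 * that cm[v] clamps v to 0..255 by table lookup; (b + 16) >> 5 is a
 * rounded division by 32 (the filter taps sum to 32).  The no_rnd forms
 * use +15 so that ties round down, matching MPEG-4's rounding control. */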

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
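
/* H.264 luma interpolation.  Half-pel samples use the 6-tap filter
 * (1, -5, 20, 20, -5, 1); the separable h and v passes round with
 * (b + 16) >> 5 via OP, while the hv path keeps unrounded 16-bit
 * intermediates in tmp and rounds the wider sum once at the end via OP2
 * ((b + 512) >> 10 in the op2_* definitions further down). */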

#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
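
/* Dispatchers for the 16 H.264 luma positions.  In mcXY, X is the
 * horizontal and Y the vertical quarter-pel offset (0..3).  Full- and
 * half-pel planes are combined with pixels_l2: e.g. mc10 averages the
 * integer samples with the horizontal half-pel plane, and mc11 averages
 * the horizontal and vertical half-pel planes. */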
2556

    
2557
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

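/* Rounding for the H.264 six-tap filter (1,-5,20,20,-5,1): one filtering pass
 * scales by 32, hence (b+16)>>5; the separable two-dimensional pass scales by
 * 32*32=1024, hence (b+512)>>10. op_put stores the clipped result, op_avg
 * additionally rounds it against the existing destination pixel. */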
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

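/* H.264 explicit weighted prediction: op_scale1 scales a single reference by
 * 'weight', op_scale2 blends two references with 'weights' and 'weightd';
 * both add a rounding offset and shift down by the log2 denominator. */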
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

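/* WMV2 half-pel interpolation: a 4-tap (-1,9,9,-1)/16 lowpass, applied
 * horizontally here and vertically in wmv2_mspel8_v_lowpass() below. */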
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

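/* H.263 deblocking: filters the two pixels on each side of a block edge with
 * a strength derived from the quantizer via ff_h263_loop_filter_strength[];
 * the (p&256) test is a branch-reduced clip of the result to 0..255. */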
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

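/* H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block; the
 * border rows and columns are passed through unfiltered. */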
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

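/* H.264 in-loop deblocking: an edge is filtered only while the gradients
 * across it stay below the alpha (edge) and beta (activity) thresholds;
 * tc0[] bounds the correction per 4-pixel segment, a negative entry skips
 * the segment, and the _intra variants use stronger, unclipped filtering. */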
static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    if(tc0[i])
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    if(tc0[i])
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

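/* SAD comparators for motion estimation: pix_abs{8,16}_c compare two blocks
 * directly, while the _x2/_y2/_xy2 variants compare against the reference
 * averaged to the horizontal, vertical or diagonal half-pel position. */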
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

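/* Noise-preserving SSE: squared error plus a penalty on the difference in
 * local texture (second-order pixel differences), so that removing noise and
 * detail is also counted as distortion; weighted by avctx->nsse_weight. */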
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

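/* The following byte-wise helpers operate on sizeof(long) pixels at a time:
 * pb_7f masks every byte's MSB so the 7-bit partial sums cannot carry into
 * the next byte lane, and the MSBs are recombined separately by XORing with
 * pb_80-masked bits. A scalar loop handles the remaining tail bytes. */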
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

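/* HuffYUV median prediction: each pixel is predicted as
 * mid_pred(left, top, left + top - topleft); add_ reconstructs pixels from
 * residuals during decoding, sub_ produces residuals for encoding. */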
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

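/* SATD-style comparators: an 8x8 Hadamard transform of the difference block
 * (or of the source block itself in the _intra variant, which subtracts the
 * DC term afterwards), summing the absolute transform coefficients. */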
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3752
    int i;
3753
    int temp[64];
3754
    int sum=0;
3755

    
3756
    assert(h==8);
3757

    
3758
    for(i=0; i<8; i++){
3759
        //FIXME try pointer walks
3760
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3761
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3762
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3763
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3764

    
3765
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3766
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3767
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3768
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3769

    
3770
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3771
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3772
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3773
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3774
    }
3775

    
3776
    for(i=0; i<8; i++){
3777
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3778
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3779
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3780
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3781

    
3782
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3783
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3784
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3785
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3786

    
3787
        sum +=
3788
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3789
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3790
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3791
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3792
    }
3793
#if 0
3794
static int maxi=0;
3795
if(sum>maxi){
3796
    maxi=sum;
3797
    printf("MAX:%d\n", maxi);
3798
}
3799
#endif
3800
    return sum;
3801
}
3802

    
3803
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3804
    int i;
3805
    int temp[64];
3806
    int sum=0;
3807

    
3808
    assert(h==8);
3809

    
3810
    for(i=0; i<8; i++){
3811
        //FIXME try pointer walks
3812
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3813
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3814
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3815
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3816

    
3817
        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3818
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3819
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3820
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3821

    
3822
        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3823
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3824
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3825
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3826
    }
3827

    
3828
    for(i=0; i<8; i++){
3829
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3830
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3831
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3832
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3833

    
3834
        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3835
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3836
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3837
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3838

    
3839
        sum +=
3840
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3841
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3842
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3843
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3844
    }
3845

    
3846
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3847

    
3848
    return sum;
3849
}
3850

    
3851
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3852
    MpegEncContext * const s= (MpegEncContext *)c;
3853
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3854

    
3855
    assert(h==8);
3856

    
3857
    s->dsp.diff_pixels(temp, src1, src2, stride);
3858
    s->dsp.fdct(temp);
3859
    return s->dsp.sum_abs_dctelem(temp);
3860
}
3861

    
3862
#if CONFIG_GPL
3863
#define DCT8_1D {\
3864
    const int s07 = SRC(0) + SRC(7);\
3865
    const int s16 = SRC(1) + SRC(6);\
3866
    const int s25 = SRC(2) + SRC(5);\
3867
    const int s34 = SRC(3) + SRC(4);\
3868
    const int a0 = s07 + s34;\
3869
    const int a1 = s16 + s25;\
3870
    const int a2 = s07 - s34;\
3871
    const int a3 = s16 - s25;\
3872
    const int d07 = SRC(0) - SRC(7);\
3873
    const int d16 = SRC(1) - SRC(6);\
3874
    const int d25 = SRC(2) - SRC(5);\
3875
    const int d34 = SRC(3) - SRC(4);\
3876
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3877
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3878
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3879
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3880
    DST(0,  a0 + a1     ) ;\
3881
    DST(1,  a4 + (a7>>2)) ;\
3882
    DST(2,  a2 + (a3>>1)) ;\
3883
    DST(3,  a5 + (a6>>2)) ;\
3884
    DST(4,  a0 - a1     ) ;\
3885
    DST(5,  a6 - (a5>>2)) ;\
3886
    DST(6, (a2>>1) - a3 ) ;\
3887
    DST(7, (a4>>2) - a7 ) ;\
3888
}
3889

    
3890
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3891
    MpegEncContext * const s= (MpegEncContext *)c;
3892
    DCTELEM dct[8][8];
3893
    int i;
3894
    int sum=0;
3895

    
3896
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3897

    
3898
#define SRC(x) dct[i][x]
3899
#define DST(x,v) dct[i][x]= v
3900
    for( i = 0; i < 8; i++ )
3901
        DCT8_1D
3902
#undef SRC
3903
#undef DST
3904

    
3905
#define SRC(x) dct[x][i]
3906
#define DST(x,v) sum += FFABS(v)
3907
    for( i = 0; i < 8; i++ )
3908
        DCT8_1D
3909
#undef SRC
3910
#undef DST
3911
    return sum;
3912
}
3913
#endif
3914

    
3915
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3916
    MpegEncContext * const s= (MpegEncContext *)c;
3917
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3918
    int sum=0, i;
3919

    
3920
    assert(h==8);
3921

    
3922
    s->dsp.diff_pixels(temp, src1, src2, stride);
3923
    s->dsp.fdct(temp);
3924

    
3925
    for(i=0; i<64; i++)
3926
        sum= FFMAX(sum, FFABS(temp[i]));
3927

    
3928
    return sum;
3929
}
3930

    
3931
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3932
    MpegEncContext * const s= (MpegEncContext *)c;
3933
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3934
    DCTELEM * const bak = temp+64;
3935
    int sum=0, i;
3936

    
3937
    assert(h==8);
3938
    s->mb_intra=0;
3939

    
3940
    s->dsp.diff_pixels(temp, src1, src2, stride);
3941

    
3942
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3943

    
3944
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3945
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3946
    ff_simple_idct(temp); //FIXME
3947

    
3948
    for(i=0; i<64; i++)
3949
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3950

    
3951
    return sum;
3952
}
3953

    
3954
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3955
    MpegEncContext * const s= (MpegEncContext *)c;
3956
    const uint8_t *scantable= s->intra_scantable.permutated;
3957
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3958
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3959
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3960
    int i, last, run, bits, level, distortion, start_i;
3961
    const int esc_length= s->ac_esc_length;
3962
    uint8_t * length;
3963
    uint8_t * last_length;
3964

    
3965
    assert(h==8);
3966

    
3967
    copy_block8(lsrc1, src1, 8, stride, 8);
3968
    copy_block8(lsrc2, src2, 8, stride, 8);
3969

    
3970
    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3971

    
3972
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3973

    
3974
    bits=0;
3975

    
3976
    if (s->mb_intra) {
3977
        start_i = 1;
3978
        length     = s->intra_ac_vlc_length;
3979
        last_length= s->intra_ac_vlc_last_length;
3980
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3981
    } else {
3982
        start_i = 0;
3983
        length     = s->inter_ac_vlc_length;
3984
        last_length= s->inter_ac_vlc_last_length;
3985
    }
3986

    
3987
    if(last>=start_i){
3988
        run=0;
3989
        for(i=start_i; i<last; i++){
3990
            int j= scantable[i];
3991
            level= temp[j];
3992

    
3993
            if(level){
3994
                level+=64;
3995
                if((level&(~127)) == 0){
3996
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3997
                }else
3998
                    bits+= esc_length;
3999
                run=0;
4000
            }else
4001
                run++;
4002
        }
4003
        i= scantable[last];
4004

    
4005
        level= temp[i] + 64;
4006

    
4007
        assert(level - 64);
4008

    
4009
        if((level&(~127)) == 0){
4010
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4011
        }else
4012
            bits+= esc_length;
4013

    
4014
    }
4015

    
4016
    if(last>=0){
4017
        if(s->mb_intra)
4018
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
4019
        else
4020
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
4021
    }
4022

    
4023
    s->dsp.idct_add(lsrc2, 8, temp);
4024

    
4025
    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
4026

    
4027
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
4028
}
4029

    
4030
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
4031
    MpegEncContext * const s= (MpegEncContext *)c;
4032
    const uint8_t *scantable= s->intra_scantable.permutated;
4033
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
4034
    int i, last, run, bits, level, start_i;
4035
    const int esc_length= s->ac_esc_length;
4036
    uint8_t * length;
4037
    uint8_t * last_length;
4038

    
4039
    assert(h==8);
4040

    
4041
    s->dsp.diff_pixels(temp, src1, src2, stride);
4042

    
4043
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
4044

    
4045
    bits=0;
4046

    
4047
    if (s->mb_intra) {
4048
        start_i = 1;
4049
        length     = s->intra_ac_vlc_length;
4050
        last_length= s->intra_ac_vlc_last_length;
4051
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
4052
    } else {
4053
        start_i = 0;
4054
        length     = s->inter_ac_vlc_length;
4055
        last_length= s->inter_ac_vlc_last_length;
4056
    }
4057

    
4058
    if(last>=start_i){
4059
        run=0;
4060
        for(i=start_i; i<last; i++){
4061
            int j= scantable[i];
4062
            level= temp[j];
4063

    
4064
            if(level){
4065
                level+=64;
4066
                if((level&(~127)) == 0){
4067
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
4068
                }else
4069
                    bits+= esc_length;
4070
                run=0;
4071
            }else
4072
                run++;
4073
        }
4074
        i= scantable[last];
4075

    
4076
        level= temp[i] + 64;
4077

    
4078
        assert(level - 64);
4079

    
4080
        if((level&(~127)) == 0){
4081
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4082
        }else
4083
            bits+= esc_length;
4084
    }
4085

    
4086
    return bits;
4087
}
4088

    
4089
#define VSAD_INTRA(size) \
4090
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4091
    int score=0;                                                                                            \
4092
    int x,y;                                                                                                \
4093
                                                                                                            \
4094
    for(y=1; y<h; y++){                                                                                     \
4095
        for(x=0; x<size; x+=4){                                                                             \
4096
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
4097
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
4098
        }                                                                                                   \
4099
        s+= stride;                                                                                         \
4100
    }                                                                                                       \
4101
                                                                                                            \
4102
    return score;                                                                                           \
4103
}
4104
VSAD_INTRA(8)
4105
VSAD_INTRA(16)
4106

    
4107
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4108
    int score=0;
4109
    int x,y;
4110

    
4111
    for(y=1; y<h; y++){
4112
        for(x=0; x<16; x++){
4113
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
4114
        }
4115
        s1+= stride;
4116
        s2+= stride;
4117
    }
4118

    
4119
    return score;
4120
}
4121

    
4122
#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

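/* Inter version: vertical SSE of the difference signal s1 - s2. */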
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

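/* Sum of squared differences between an int8_t vector and an int16_t
 * vector of the same length. */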
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

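/* WRAPPER8_16_SQ (defined earlier in this file) derives each 16x16
 * comparison function from its 8x8 counterpart by summing the 8x8 score
 * over the two (h==8) or four (h==16) 8x8 sub-blocks.  dct264_sad is
 * only compiled when GPL code is enabled, hence the CONFIG_GPL guard. */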
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

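/* dst[i] *= src[i] -- element-wise in-place multiply of two float
 * vectors. */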
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

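/* dst[i] = src0[i] * src1[len-1-i] -- multiply src0 by src1 read
 * backwards, e.g. for applying a time-reversed window. */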
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

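/* dst[i] = src0[i] * src1[i] + src2[i] -- element-wise multiply-add. */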
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

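/* Windowed overlap-add as used by MDCT-based audio decoders.  dst, src0
 * and win are indexed from the middle of a 2*len region; for each pair
 * (i, j) with i in [-len, -1] and j = -i-1:
 *
 *     dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias
 *     dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias
 *
 * i.e. src0 (saved samples from the previous frame) and src1 (the
 * current frame) are cross-faded under the symmetric window win.
 * add_bias is a constant DC offset (0 for plain float output) that some
 * decoders use to speed up the later float-to-int16 conversion. */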
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

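/* dst[i] = src[i] * mul -- scale a float vector by a constant.  Like the
 * other float routines here, callers are expected to go through the
 * DSPContext function pointer so SIMD versions can be slotted in; a
 * hypothetical caller, assuming a DSPContext dsp filled in by
 * dsputil_init(), would look like:
 *
 *     dsp.vector_fmul_scalar(out, in, 1.0f/32768.0f, 1024);
 *
 * Optimized implementations typically require len to be a multiple of
 * the SIMD vector width. */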
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

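/* dst[i] = src[i] * sv[i/2][i&1] * mul (and the analogous by-4 version
 * below): every pair (or quad) of samples is scaled by its own short
 * scale vector taken from sv, times a common constant mul. */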
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}