Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 49fb20cb

History | View | Annotate | Download (164 KB)

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24

    
/**
 * @file dsputil.c
 * DSP utils
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "h263.h"
36
#include "snow.h"
37

    
38
/* snow.c */
39
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40

    
41
/* vorbis.c */
42
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43

    
44
/* ac3dec.c */
45
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
46

    
47
/* flacenc.c */
48
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
49

    
50
/* pngdec.c */
51
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
52

    
53
/* eaidct.c */
54
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
55

    
56
/*
 * Shared lookup tables, zero-initialised here.
 * NOTE(review): presumably filled in by an init routine that is not part of
 * this chunk -- confirm before relying on their contents.
 */
uint8_t  ff_cropTbl[256 + 2 * MAX_NEG_CROP] = { 0 };
uint32_t ff_squareTbl[512]                  = { 0 };
58

    
59
/*
 * Byte-replicated constants as wide as unsigned long:
 * 0x7f7f7f7f / 0x80808080 with a 32-bit long,
 * 0x7f7f7f7f7f7f7f7f / 0x8080808080808080 with a 64-bit long.
 * (~0UL / 255 is 0x0101...01, i.e. a 1 in every byte.)
 */
#define pb_7f ((~0UL / 255) * 0x7f)
#define pb_80 ((~0UL / 255) * 0x80)
62

    
63
/*
 * Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order.
 */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
73

    
74
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
86

    
87
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
88
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
89

    
90
/* Alternate (horizontal) coefficient scan order for an 8x8 block, as raster indices. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
100

    
101
/* Alternate (vertical) coefficient scan order for an 8x8 block, as raster indices. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
111

    
112
/*
 * Fixed-point reciprocals: for every 0 <= a <= 65536 and 2 <= b <= 255,
 *     (a * ff_inverse[b]) >> 32 == a / b
 * when the product is evaluated in 64 bits.
 */
const uint32_t ff_inverse[256] = {
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
147

    
148
/* Coefficient input permutation expected by simple_idct_mmx. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
159

    
160
/* Permutation of the eight positions 0..7 (NOTE(review): presumably applied within each row for the SSE2 IDCT -- confirm at the use site). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
161

    
162
/**
 * Initialises a ScanTable from a source scan order and a permutation.
 *
 * @param permutation   64-entry table; entry j is the permuted position of index j
 * @param st            scan table to fill in (scantable, permutated, raster_end,
 *                      and inverse when ARCH_PPC is set)
 * @param src_scantable 64-entry scan order; the pointer itself is stored in st
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable)
{
    int pos;
    int highest = -1;

    st->scantable = src_scantable;

    /* run every scan position through the permutation */
    for (pos = 0; pos < 64; pos++) {
        const int coeff = src_scantable[pos];

        st->permutated[pos] = permutation[coeff];
#if ARCH_PPC
        st->inverse[coeff] = pos;
#endif
    }

    /* raster_end[pos] = largest permuted index seen in positions 0..pos */
    for (pos = 0; pos < 64; pos++) {
        const int idx = st->permutated[pos];

        if (highest < idx)
            highest = idx;
        st->raster_end[pos] = highest;
    }
}
185

    
186
/**
 * Sums all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of two consecutive rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
207

    
208
static int pix_norm1_c(uint8_t * pix, int line_size)
209
{
210
    int s, i, j;
211
    uint32_t *sq = ff_squareTbl + 256;
212

    
213
    s = 0;
214
    for (i = 0; i < 16; i++) {
215
        for (j = 0; j < 16; j += 8) {
216
#if 0
217
            s += sq[pix[0]];
218
            s += sq[pix[1]];
219
            s += sq[pix[2]];
220
            s += sq[pix[3]];
221
            s += sq[pix[4]];
222
            s += sq[pix[5]];
223
            s += sq[pix[6]];
224
            s += sq[pix[7]];
225
#else
226
#if LONG_MAX > 2147483647
227
            register uint64_t x=*(uint64_t*)pix;
228
            s += sq[x&0xff];
229
            s += sq[(x>>8)&0xff];
230
            s += sq[(x>>16)&0xff];
231
            s += sq[(x>>24)&0xff];
232
            s += sq[(x>>32)&0xff];
233
            s += sq[(x>>40)&0xff];
234
            s += sq[(x>>48)&0xff];
235
            s += sq[(x>>56)&0xff];
236
#else
237
            register uint32_t x=*(uint32_t*)pix;
238
            s += sq[x&0xff];
239
            s += sq[(x>>8)&0xff];
240
            s += sq[(x>>16)&0xff];
241
            s += sq[(x>>24)&0xff];
242
            x=*(uint32_t*)(pix+4);
243
            s += sq[x&0xff];
244
            s += sq[(x>>8)&0xff];
245
            s += sq[(x>>16)&0xff];
246
            s += sq[(x>>24)&0xff];
247
#endif
248
#endif
249
            pix += 8;
250
        }
251
        pix += line_size - 16;
252
    }
253
    return s;
254
}
255

    
256
/**
 * Byte-swaps w 32-bit words from src into dst, in ascending index order.
 *
 * @param dst destination words
 * @param src source words
 * @param w   number of 32-bit words to process
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
273

    
274
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
275
{
276
    int s, i;
277
    uint32_t *sq = ff_squareTbl + 256;
278

    
279
    s = 0;
280
    for (i = 0; i < h; i++) {
281
        s += sq[pix1[0] - pix2[0]];
282
        s += sq[pix1[1] - pix2[1]];
283
        s += sq[pix1[2] - pix2[2]];
284
        s += sq[pix1[3] - pix2[3]];
285
        pix1 += line_size;
286
        pix2 += line_size;
287
    }
288
    return s;
289
}
290

    
291
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
292
{
293
    int s, i;
294
    uint32_t *sq = ff_squareTbl + 256;
295

    
296
    s = 0;
297
    for (i = 0; i < h; i++) {
298
        s += sq[pix1[0] - pix2[0]];
299
        s += sq[pix1[1] - pix2[1]];
300
        s += sq[pix1[2] - pix2[2]];
301
        s += sq[pix1[3] - pix2[3]];
302
        s += sq[pix1[4] - pix2[4]];
303
        s += sq[pix1[5] - pix2[5]];
304
        s += sq[pix1[6] - pix2[6]];
305
        s += sq[pix1[7] - pix2[7]];
306
        pix1 += line_size;
307
        pix2 += line_size;
308
    }
309
    return s;
310
}
311

    
312
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
313
{
314
    int s, i;
315
    uint32_t *sq = ff_squareTbl + 256;
316

    
317
    s = 0;
318
    for (i = 0; i < h; i++) {
319
        s += sq[pix1[ 0] - pix2[ 0]];
320
        s += sq[pix1[ 1] - pix2[ 1]];
321
        s += sq[pix1[ 2] - pix2[ 2]];
322
        s += sq[pix1[ 3] - pix2[ 3]];
323
        s += sq[pix1[ 4] - pix2[ 4]];
324
        s += sq[pix1[ 5] - pix2[ 5]];
325
        s += sq[pix1[ 6] - pix2[ 6]];
326
        s += sq[pix1[ 7] - pix2[ 7]];
327
        s += sq[pix1[ 8] - pix2[ 8]];
328
        s += sq[pix1[ 9] - pix2[ 9]];
329
        s += sq[pix1[10] - pix2[10]];
330
        s += sq[pix1[11] - pix2[11]];
331
        s += sq[pix1[12] - pix2[12]];
332
        s += sq[pix1[13] - pix2[13]];
333
        s += sq[pix1[14] - pix2[14]];
334
        s += sq[pix1[15] - pix2[15]];
335

    
336
        pix1 += line_size;
337
        pix2 += line_size;
338
    }
339
    return s;
340
}
341

    
342

    
343
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
344
/**
 * Wavelet-domain block comparison.
 *
 * The difference pix1 - pix2 is scaled by 16, transformed with ff_spatial_dwt(),
 * and the absolute values of the resulting coefficients are summed, each weighted
 * by a per-subband factor from the scale table.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; 8 selects 3 decomposition levels, anything else 4.
 *                  The work buffer is 32x32, so w and h must not exceed 32.
 * @param h         block height; asserted to equal w
 * @param type      first index into the scale table (0: 9/7 weights, 1: 5/3 weights),
 *                  also passed through to ff_spatial_dwt()
 * @return weighted absolute coefficient sum, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* Differences can be negative, so scale by multiplying: left-shifting a
     * negative int is undefined behaviour. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named coef so it no longer shadows the parameter v */
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
412

    
413
/** w_c() on an 8-sample-wide block using the 5/3 weights (type 1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
416

    
417
/** w_c() on an 8-sample-wide block using the 9/7 weights (type 0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
420

    
421
/** w_c() on a 16-sample-wide block using the 5/3 weights (type 1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
424

    
425
/** w_c() on a 16-sample-wide block using the 9/7 weights (type 0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
428

    
429
/** w_c() on a 32-sample-wide block using the 5/3 weights (type 1); has external linkage. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32, type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
432

    
433
/** w_c() on a 32-sample-wide block using the 9/7 weights (type 0); has external linkage. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 32, type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
436
#endif
437

    
438
/**
 * Replicates the border samples of an image into a margin of w samples on
 * every side (the margin must already be addressable around buf).
 *
 * @param buf    top-left sample of the image
 * @param wrap   distance in bytes between the starts of two consecutive rows
 * @param width  image width in samples
 * @param height image height in rows
 * @param w      margin width to fill
 */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *const bottom = buf + (height - 1) * wrap;
    uint8_t *row;
    int n;

    /* above and below: copy the first / last image row outwards */
    for (n = 1; n <= w; n++) {
        memcpy(buf    - n * wrap, buf,    width);
        memcpy(bottom + n * wrap, bottom, width);
    }

    /* left and right: repeat the first / last sample of each image row */
    for (n = 0, row = buf; n < height; n++, row += wrap) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
    }

    /* the four corner areas take the value of the nearest image corner */
    for (n = 1; n <= w; n++) {
        memset(buf    - n * wrap - w,     buf[0],            w); /* top left */
        memset(buf    - n * wrap + width, buf[width - 1],    w); /* top right */
        memset(bottom + n * wrap - w,     bottom[0],         w); /* bottom left */
        memset(bottom + n * wrap + width, bottom[width - 1], w); /* bottom right */
    }
}
466

    
467
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h)
{
    int col, row;
    int first_row, first_col, last_row, last_col;

    /* a block entirely outside the picture is moved so that it overlaps
       the picture by exactly one row / column */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src  += (w - 1 - src_x);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += (1 - block_w - src_x);
        src_x = 1 - block_w;
    }

    /* part of the block that lies inside the picture: [first, last) */
    first_row = (0 > -src_y) ? 0 : -src_y;
    first_col = (0 > -src_x) ? 0 : -src_x;
    last_row  = (block_h > h - src_y) ? h - src_y : block_h;
    last_col  = (block_w > w - src_x) ? w - src_x : block_w;

    /* copy the part that really exists */
    for (row = first_row; row < last_row; row++) {
        uint8_t       *out = buf + row * linesize;
        const uint8_t *in  = src + row * linesize;

        for (col = first_col; col < last_col; col++)
            out[col] = in[col];
    }

    /* rows above: repeat the first valid row */
    for (row = 0; row < first_row; row++) {
        for (col = first_col; col < last_col; col++)
            buf[col + row * linesize] = buf[col + first_row * linesize];
    }

    /* rows below: repeat the last valid row */
    for (row = last_row; row < block_h; row++) {
        for (col = first_col; col < last_col; col++)
            buf[col + row * linesize] = buf[col + (last_row - 1) * linesize];
    }

    /* finally extend every row to the left and to the right */
    for (row = 0; row < block_h; row++) {
        uint8_t *out = buf + row * linesize;

        for (col = 0; col < first_col; col++)
            out[col] = out[first_col];

        for (col = last_col; col < block_w; col++)
            out[col] = out[last_col - 1];
    }
}
537

    
538
/**
 * Copies an 8x8 block of samples into a coefficient block.
 *
 * @param block     destination, 64 contiguous elements (8 per row)
 * @param pixels    top-left source sample
 * @param line_size source row stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
556

    
557
/**
 * Stores the sample-wise difference s1 - s2 of two 8x8 blocks.
 *
 * @param block  destination, 64 contiguous elements (8 per row)
 * @param s1     top-left sample of the minuend block
 * @param s2     top-left sample of the subtrahend block
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
576

    
577

    
578
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
579
                                 int line_size)
580
{
581
    int i;
582
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
583

    
584
    /* read the pixels */
585
    for(i=0;i<8;i++) {
586
        pixels[0] = cm[block[0]];
587
        pixels[1] = cm[block[1]];
588
        pixels[2] = cm[block[2]];
589
        pixels[3] = cm[block[3]];
590
        pixels[4] = cm[block[4]];
591
        pixels[5] = cm[block[5]];
592
        pixels[6] = cm[block[6]];
593
        pixels[7] = cm[block[7]];
594

    
595
        pixels += line_size;
596
        block += 8;
597
    }
598
}
599

    
600
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
601
                                 int line_size)
602
{
603
    int i;
604
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
605

    
606
    /* read the pixels */
607
    for(i=0;i<4;i++) {
608
        pixels[0] = cm[block[0]];
609
        pixels[1] = cm[block[1]];
610
        pixels[2] = cm[block[2]];
611
        pixels[3] = cm[block[3]];
612

    
613
        pixels += line_size;
614
        block += 8;
615
    }
616
}
617

    
618
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
619
                                 int line_size)
620
{
621
    int i;
622
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
623

    
624
    /* read the pixels */
625
    for(i=0;i<2;i++) {
626
        pixels[0] = cm[block[0]];
627
        pixels[1] = cm[block[1]];
628

    
629
        pixels += line_size;
630
        block += 8;
631
    }
632
}
633

    
634
/**
 * Writes an 8x8 block of signed values to unsigned samples: each value is
 * biased by +128 and saturated, so values below -128 give 0 and values
 * above 127 give 255.
 *
 * @param block     source, 64 contiguous elements
 * @param pixels    top-left destination sample
 * @param line_size destination row stride in bytes
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int val = block[col];

            pixels[col] = val < -128 ? 0
                        : val >  127 ? 255
                        : (uint8_t)(val + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
654

    
655
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
656
                          int line_size)
657
{
658
    int i;
659
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
660

    
661
    /* read the pixels */
662
    for(i=0;i<8;i++) {
663
        pixels[0] = cm[pixels[0] + block[0]];
664
        pixels[1] = cm[pixels[1] + block[1]];
665
        pixels[2] = cm[pixels[2] + block[2]];
666
        pixels[3] = cm[pixels[3] + block[3]];
667
        pixels[4] = cm[pixels[4] + block[4]];
668
        pixels[5] = cm[pixels[5] + block[5]];
669
        pixels[6] = cm[pixels[6] + block[6]];
670
        pixels[7] = cm[pixels[7] + block[7]];
671
        pixels += line_size;
672
        block += 8;
673
    }
674
}
675

    
676
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
677
                          int line_size)
678
{
679
    int i;
680
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
681

    
682
    /* read the pixels */
683
    for(i=0;i<4;i++) {
684
        pixels[0] = cm[pixels[0] + block[0]];
685
        pixels[1] = cm[pixels[1] + block[1]];
686
        pixels[2] = cm[pixels[2] + block[2]];
687
        pixels[3] = cm[pixels[3] + block[3]];
688
        pixels += line_size;
689
        block += 8;
690
    }
691
}
692

    
693
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
694
                          int line_size)
695
{
696
    int i;
697
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
698

    
699
    /* read the pixels */
700
    for(i=0;i<2;i++) {
701
        pixels[0] = cm[pixels[0] + block[0]];
702
        pixels[1] = cm[pixels[1] + block[1]];
703
        pixels += line_size;
704
        block += 8;
705
    }
706
}
707

    
708
/**
 * Adds an 8x8 block of values to the samples without any clamping; the
 * result is simply truncated to uint8_t.
 *
 * @param pixels    top-left sample, updated in place
 * @param block     values to add, 8 elements per row
 * @param line_size sample row stride in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
724

    
725
/**
 * Adds a 4x4 block of values to the samples without any clamping; the
 * result is simply truncated to uint8_t.
 *
 * @param pixels    top-left sample, updated in place
 * @param block     values to add, stored with 4 elements per row
 * @param line_size sample row stride in bytes
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;    /* the source block is packed 4 wide, not 8 */
    }
}
737

    
738
/**
 * Sums the absolute values of the 64 elements of a coefficient block.
 *
 * @param block 64 contiguous elements
 * @return sum of FFABS() over all elements
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *const end = block + 64;
    int total = 0;

    while (block < end) {
        total += FFABS(*block);
        block++;
    }
    return total;
}
745

    
746
#if 0
747

748
#define PIXOP2(OPNAME, OP) \
749
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
750
{\
751
    int i;\
752
    for(i=0; i<h; i++){\
753
        OP(*((uint64_t*)block), AV_RN64(pixels));\
754
        pixels+=line_size;\
755
        block +=line_size;\
756
    }\
757
}\
758
\
759
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
760
{\
761
    int i;\
762
    for(i=0; i<h; i++){\
763
        const uint64_t a= AV_RN64(pixels  );\
764
        const uint64_t b= AV_RN64(pixels+1);\
765
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
766
        pixels+=line_size;\
767
        block +=line_size;\
768
    }\
769
}\
770
\
771
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
772
{\
773
    int i;\
774
    for(i=0; i<h; i++){\
775
        const uint64_t a= AV_RN64(pixels  );\
776
        const uint64_t b= AV_RN64(pixels+1);\
777
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
778
        pixels+=line_size;\
779
        block +=line_size;\
780
    }\
781
}\
782
\
783
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
784
{\
785
    int i;\
786
    for(i=0; i<h; i++){\
787
        const uint64_t a= AV_RN64(pixels          );\
788
        const uint64_t b= AV_RN64(pixels+line_size);\
789
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
790
        pixels+=line_size;\
791
        block +=line_size;\
792
    }\
793
}\
794
\
795
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
796
{\
797
    int i;\
798
    for(i=0; i<h; i++){\
799
        const uint64_t a= AV_RN64(pixels          );\
800
        const uint64_t b= AV_RN64(pixels+line_size);\
801
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
802
        pixels+=line_size;\
803
        block +=line_size;\
804
    }\
805
}\
806
\
807
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
808
{\
809
        int i;\
810
        const uint64_t a= AV_RN64(pixels  );\
811
        const uint64_t b= AV_RN64(pixels+1);\
812
        uint64_t l0=  (a&0x0303030303030303ULL)\
813
                    + (b&0x0303030303030303ULL)\
814
                    + 0x0202020202020202ULL;\
815
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
816
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
817
        uint64_t l1,h1;\
818
\
819
        pixels+=line_size;\
820
        for(i=0; i<h; i+=2){\
821
            uint64_t a= AV_RN64(pixels  );\
822
            uint64_t b= AV_RN64(pixels+1);\
823
            l1=  (a&0x0303030303030303ULL)\
824
               + (b&0x0303030303030303ULL);\
825
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
826
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
827
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
828
            pixels+=line_size;\
829
            block +=line_size;\
830
            a= AV_RN64(pixels  );\
831
            b= AV_RN64(pixels+1);\
832
            l0=  (a&0x0303030303030303ULL)\
833
               + (b&0x0303030303030303ULL)\
834
               + 0x0202020202020202ULL;\
835
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
836
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
837
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
838
            pixels+=line_size;\
839
            block +=line_size;\
840
        }\
841
}\
842
\
843
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
844
{\
845
        int i;\
846
        const uint64_t a= AV_RN64(pixels  );\
847
        const uint64_t b= AV_RN64(pixels+1);\
848
        uint64_t l0=  (a&0x0303030303030303ULL)\
849
                    + (b&0x0303030303030303ULL)\
850
                    + 0x0101010101010101ULL;\
851
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
852
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
853
        uint64_t l1,h1;\
854
\
855
        pixels+=line_size;\
856
        for(i=0; i<h; i+=2){\
857
            uint64_t a= AV_RN64(pixels  );\
858
            uint64_t b= AV_RN64(pixels+1);\
859
            l1=  (a&0x0303030303030303ULL)\
860
               + (b&0x0303030303030303ULL);\
861
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
862
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
863
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
864
            pixels+=line_size;\
865
            block +=line_size;\
866
            a= AV_RN64(pixels  );\
867
            b= AV_RN64(pixels+1);\
868
            l0=  (a&0x0303030303030303ULL)\
869
               + (b&0x0303030303030303ULL)\
870
               + 0x0101010101010101ULL;\
871
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
872
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
873
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
874
            pixels+=line_size;\
875
            block +=line_size;\
876
        }\
877
}\
878
\
879
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
880
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
881
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
882
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
883
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
884
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
885
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
886

887
/* Averaging store for the 64 bit variant: replaces a by the per-byte average
 * of a and b, rounded up.  Uses (a|b) - ((a^b)>>1); the 0xFE mask clears the
 * lowest bit of every byte so the shift cannot move a bit into the byte below. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
888
#else // 64 bit variant
889

    
890
/**
 * PIXOP2 (32 bit variant): expands to the family of C block operations whose
 * names start with OPNAME.
 *
 * OP(dst, val) is the store primitive chosen by the instantiation; it is
 * handed an lvalue and the freshly computed value.
 *
 * Generated groups:
 *   _pixelsN_c             store N pixel wide rows read from pixels
 *   _pixelsN_l2 / _l4      per-byte average of 2 resp. 4 source blocks
 *   _pixelsN_x2_c / _y2_c  average with the right resp. lower neighbour
 *   _pixelsN_xy2_c         average over the 2x2 neighbourhood
 *   _no_rnd_*              same, but using no_rnd_avg32() resp. the rounding
 *                          term 0x01010101 instead of 0x02020202
 *   _pixels16_*            16 wide versions built on the 8 wide ones
 *                          (via CALL_2X_PIXELS, defined elsewhere)
 *
 * The l4/xy2 code sums four bytes per lane inside one uint32_t: the low two
 * bits of every byte (mask 0x03) and the high six bits (mask 0xFC, shifted
 * down by 2) are accumulated separately so no carry reaches the next byte.
 *
 * Only /∗ ∗/ style comments may be used inside the macro body, a // comment
 * would swallow the continued lines.
 */
#define PIXOP2(OPNAME, OP) \
/* Store h rows of 2 pixels. */\
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* Store h rows of 4 pixels. */\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* Store h rows of 8 pixels as two 32 bit words per row. */\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* Nothing is averaged on a plain store, so no_rnd is the same operation. */\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
/* 8 wide: store no_rnd_avg32() of src1 and src2, each with its own stride. */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
/* 8 wide: store rnd_avg32() of src1 and src2. */\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
/* 4 wide: store rnd_avg32() of src1 and src2. */\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
/* 2 wide: the 16 bit loads are widened to 32 bit for rnd_avg32(). */\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
/* 16 wide = left and right 8 wide halves. */\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* x2: second source is the block one pixel to the right. */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
/* y2: second source is the block one line below. */\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* 8 wide average of four sources, rounding term 2 per byte; the two 32 bit  */\
/* halves of a row (byte offsets 0 and 4) are handled by the inner loop.     */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            uint32_t a, b, c, d, l0, l1, h0, h1;\
            a= AV_RN32(&src1[i*src_stride1+k]);\
            b= AV_RN32(&src2[i*src_stride2+k]);\
            c= AV_RN32(&src3[i*src_stride3+k]);\
            d= AV_RN32(&src4[i*src_stride4+k]);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            l1=  (c&0x03030303UL)\
               + (d&0x03030303UL);\
            h1= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* Same as _pixels8_l4 but with rounding term 1 per byte. */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i, k;\
    for(i=0; i<h; i++){\
        for(k=0; k<8; k+=4){\
            uint32_t a, b, c, d, l0, l1, h0, h1;\
            a= AV_RN32(&src1[i*src_stride1+k]);\
            b= AV_RN32(&src2[i*src_stride2+k]);\
            c= AV_RN32(&src3[i*src_stride3+k]);\
            d= AV_RN32(&src4[i*src_stride4+k]);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            l1=  (c&0x03030303UL)\
               + (d&0x03030303UL);\
            h1= ((c&0xFCFCFCFCUL)>>2)\
              + ((d&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)&dst[i*dst_stride+k]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        }\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* 2 wide 2x2 average done on scalar ints.  a0/b0 hold the horizontal pair   */\
/* sums (plus rounding term 2) of the previous line, a1/b1 those of the      */\
/* current line; two output rows are produced per loop iteration.            */\
/* The results go through OP so that the avg instantiation really averages   */\
/* with the destination (the stores used to be plain assignments).           */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            OP(block[0], (a1+a0)>>2);\
            OP(block[1], (b1+b0)>>2);\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            OP(block[0], (a1+a0)>>2);\
            OP(block[1], (b1+b0)>>2);\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* 4 wide 2x2 average.  l0/h0 carry the split horizontal sums of the line    */\
/* above (including the rounding term), l1/h1 those of the current line;     */\
/* the roles alternate so every source line is loaded only once.             */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* 8 wide 2x2 average: the 4 wide algorithm run on two 4 pixel columns (j).  */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top line and step 4 pixels right for the next column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* As _pixels8_xy2_c, with rounding term 1 per byte instead of 2. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)
1255

    
1256
/* Averaging store for the 32 bit variant: a is replaced by rnd_avg32(a, b).
 * Passed as OP to PIXOP2 to generate the avg_* functions. */
#define op_avg(a, b) a = rnd_avg32(a, b)
1257
#endif
1258
/* Plain store: a is overwritten with b.  Passed as OP to PIXOP2 to generate
 * the put_* functions; shared by both word-size variants. */
#define op_put(a, b) a = b
1259

    
1260
PIXOP2(avg, op_avg)
1261
PIXOP2(put, op_put)
1262
#undef op_avg
1263
#undef op_put
1264

    
1265
/* Rounded integer means of two resp. four values.  Every argument is
 * parenthesised so that expressions with low-precedence operators
 * (e.g. x<<1 or x|y) are averaged as a whole.  Arguments are evaluated
 * exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1267

    
1268
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): destination and both
 * sources are stepped with the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int src_stride = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride, src_stride, h);
}
1271

    
1272
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): destination and both
 * sources are stepped with the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int src_stride = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride, src_stride, h);
}
1275

    
1276
/**
 * Bilinear interpolation of an 8 pixel wide, h line high block.
 *
 * x16/y16 weight the right/lower neighbours in 1/16 steps; the four weights
 * always sum to 256, hence the final >>8.  Reads 9 pixels per line and one
 * line more than it writes.
 *
 * @param rounder value added to every weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16-x16)*(16-y16); /* top left     */
    const int w_tr = (   x16)*(16-y16); /* top right    */
    const int w_bl = (16-x16)*(   y16); /* bottom left  */
    const int w_br = (   x16)*(   y16); /* bottom right */
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (  w_tl*src[col]        + w_tr*src[col+1]
                        + w_bl*src[stride+col] + w_br*src[stride+col+1]
                        + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1298

    
1299
/**
 * Motion compensation of an 8 pixel wide, h line high block with a source
 * position that changes per pixel.
 *
 * The source position starts at (ox, oy) and advances by (dxx, dyx) per
 * output column and by (dxy, dyy) per output line.  After dropping 16 bits,
 * the low `shift` bits of a position are the interpolation fraction and the
 * rest is the integer pixel coordinate.
 *
 * Coordinates outside [0, width-1) x [0, height-1) are clamped; along a
 * clamped axis no interpolation is done.
 *
 * @param r rounding term added before the final >>(2*shift)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one       = 1<<shift;
    const int frac_mask = one - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for(row=0; row<h; row++){
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++){ /* XXX FIXME optimize */
            const int pos_x = vx>>16;
            const int pos_y = vy>>16;
            const int fx    = pos_x & frac_mask;
            const int fy    = pos_y & frac_mask;
            const int ix    = pos_x >> shift;
            const int iy    = pos_y >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_ok  = (unsigned)ix < max_x;
            const int y_ok  = (unsigned)iy < max_y;
            uint8_t *out    = &dst[row*stride + col];
            int index;

            if(x_ok && y_ok){
                /* fully inside: bilinear over the 2x2 neighbourhood */
                index= ix + iy*stride;
                *out= (  (  src[index         ]*(one-fx)
                          + src[index       +1]*     fx )*(one-fy)
                       + (  src[index+stride  ]*(one-fx)
                          + src[index+stride+1]*     fx )*     fy
                       + r)>>(shift*2);
            }else if(x_ok){
                /* line clamped: horizontal interpolation only */
                index= ix + av_clip(iy, 0, max_y)*stride;
                *out= ( (  src[index         ]*(one-fx)
                         + src[index       +1]*     fx )*one
                       + r)>>(shift*2);
            }else if(y_ok){
                /* column clamped: vertical interpolation only */
                index= av_clip(ix, 0, max_x) + iy*stride;
                *out= ( (  src[index         ]*(one-fy)
                         + src[index+stride  ]*     fy )*one
                       + r)>>(shift*2);
            }else{
                /* both clamped: nearest border pixel */
                index= av_clip(ix, 0, max_x) + av_clip(iy, 0, max_y)*stride;
                *out= src[index];
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1356

    
1357
/**
 * Integer position case: forwards to the put_pixelsN_c copy matching the
 * block width.  Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1365

    
1366
/**
 * Horizontal interpolation with weights 2:1 (current pixel : right
 * neighbour).  683/2^11 approximates the division by 3; reads width+1
 * pixels per line.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1];

            dst[col] = (683*(sum + 1)) >> 11;
        }
    }
}
1376

    
1377
/**
 * Horizontal interpolation with weights 1:2 (current pixel : right
 * neighbour).  683/2^11 approximates the division by 3; reads width+1
 * pixels per line.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1];

            dst[col] = (683*(sum + 1)) >> 11;
        }
    }
}
1387

    
1388
/**
 * Vertical interpolation with weights 2:1 (current pixel : pixel below).
 * 683/2^11 approximates the division by 3; reads height+1 lines.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride];

            dst[col] = (683*(sum + 1)) >> 11;
        }
    }
}
1398

    
1399
/**
 * 2x2 interpolation with weights 4 (current), 3 (right), 3 (below) and
 * 2 (below right).  The weights sum to 12; 2731/2^15 approximates the
 * division by 12 and 6 is the rounding term.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1];

            dst[col] = (2731*(sum + 6)) >> 15;
        }
    }
}
1409

    
1410
/**
 * 2x2 interpolation with weights 3 (current), 2 (right), 4 (below) and
 * 3 (below right).  The weights sum to 12; 2731/2^15 approximates the
 * division by 12 and 6 is the rounding term.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1];

            dst[col] = (2731*(sum + 6)) >> 15;
        }
    }
}
1420

    
1421
/**
 * Vertical interpolation with weights 1:2 (current pixel : pixel below).
 * 683/2^11 approximates the division by 3; reads height+1 lines.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride];

            dst[col] = (683*(sum + 1)) >> 11;
        }
    }
}
1431

    
1432
/**
 * 2x2 interpolation with weights 3 (current), 4 (right), 2 (below) and
 * 3 (below right).  The weights sum to 12; 2731/2^15 approximates the
 * division by 12 and 6 is the rounding term.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1];

            dst[col] = (2731*(sum + 6)) >> 15;
        }
    }
}
1442

    
1443
/**
 * 2x2 interpolation with weights 2 (current), 3 (right), 3 (below) and
 * 4 (below right).  The weights sum to 12; 2731/2^15 approximates the
 * division by 12 and 6 is the rounding term.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1];

            dst[col] = (2731*(sum + 6)) >> 15;
        }
    }
}
1453

    
1454
/**
 * Integer position case: forwards to the avg_pixelsN_c function matching
 * the block width.  Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1462

    
1463
/**
 * Horizontal 2:1 interpolation (current pixel : right neighbour, scaled by
 * 683/2^11), then averaged with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1473

    
1474
/**
 * Horizontal 1:2 interpolation (current pixel : right neighbour, scaled by
 * 683/2^11), then averaged with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1484

    
1485
/**
 * Vertical 2:1 interpolation (current pixel : pixel below, scaled by
 * 683/2^11), then averaged with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1495

    
1496
/**
 * 2x2 interpolation with weights 4/3/3/2 (current, right, below, below
 * right; scaled by 2731/2^15 with rounding term 6), then averaged with the
 * existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1506

    
1507
/**
 * 2x2 interpolation with weights 3/2/4/3 (current, right, below, below
 * right; scaled by 2731/2^15 with rounding term 6), then averaged with the
 * existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1517

    
1518
/**
 * Vertical 1:2 interpolation (current pixel : pixel below, scaled by
 * 683/2^11), then averaged with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1528

    
1529
/**
 * 2x2 interpolation with weights 3/4/2/3 (current, right, below, below
 * right; scaled by 2731/2^15 with rounding term 6), then averaged with the
 * existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1539

    
1540
/**
 * 2x2 interpolation with weights 2/3/3/4 (current, right, below, below
 * right; scaled by 2731/2^15 with rounding term 6), then averaged with the
 * existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1550
#if 0
1551
/* TPEL_WIDTH(width): generates fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) around the generic
 * put_tpel_pixels_mcXY_c() functions, passing the width as a constant.
 * Currently compiled out by the surrounding #if 0.
 * The wrapper bodies are real calls; they used to be written as
 * "void f(args);", which is an (invalid) declaration, not a call. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1570
#endif
1571

    
1572
/**
 * H264_CHROMA_MC(OPNAME, OP): generates OPNAME##h264_chroma_mc{2,4,8}_c,
 * bilinear interpolation of 2, 4 and 8 pixel wide blocks of h lines.
 *
 * x and y (0..7, checked by assert) weight the right and the lower
 * neighbour in 1/8 steps.  The four weights sum to 64; the weighted sum is
 * handed unscaled to OP(dst_pixel, sum), which has to normalise it.
 *
 * When the diagonal weight is 0 (x==0 or y==0) at most one neighbour
 * contributes, so a two tap filter along the remaining direction is used:
 * `step` picks the lower neighbour if its weight is non-zero, the right one
 * otherwise.
 *
 * Only block comments may be used inside the macro body.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w10=(  x)*(8-y);\
    const int w01=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_side= w10+w01;\
        const int step= w01 ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w00*src[col] + w_side*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w10=(  x)*(8-y);\
    const int w01=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_side= w10+w01;\
        const int step= w01 ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w00*src[col] + w_side*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w10=(  x)*(8-y);\
    const int w01=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w00*src[col] + w10*src[col+1] + w01*src[stride+col] + w11*src[stride+col+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int w_side= w10+w01;\
        const int step= w01 ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w00*src[col] + w_side*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1672

    
1673
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1674
#define op_put(a, b) a = (((b) + 32)>>6)
1675

    
1676
H264_CHROMA_MC(put_       , op_put)
1677
H264_CHROMA_MC(avg_       , op_avg)
1678
#undef op_avg
1679
#undef op_put
1680

    
1681
/**
 * Bilinear chroma interpolation of an 8 pixel wide block, "no rounding" flavour.
 *
 * Each output pixel is the weighted sum of the 2x2 source neighbourhood at the
 * same position, with weights derived from the 1/8-sample offsets x and y.
 * The weights add up to 64, so the sum is shifted right by 6. The bias added
 * before the shift is 32 - 4 rather than 32, which makes results round down
 * slightly more often than with plain round-to-nearest.
 *
 * All four taps are always evaluated, so every row reads src[0..8] of the
 * current line and of the line below, even when x or y is 0.
 *
 * @param dst    destination, 8 pixels written per row
 * @param src    source; 9 pixels per row on h+1 rows are read
 * @param stride distance in bytes between rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal offset in 1/8 samples, 0..7
 * @param y      vertical offset in 1/8 samples, 0..7
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    const int bias= 32 - 4; /* rounding constant minus the "no rounding" correction */
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        /* columns are processed left to right, as in the unrolled form */
        for(j=0; j<8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + bias) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1704

    
1705
/**
 * Helper for QPEL_MC: expands to every qpel<N>_mcXY_c function of one block
 * size. The 8x8 and 16x16 families are identical apart from the constants
 * below, so both are generated from this single template.
 *
 * OPNAME  prefix of the generated functions and of the helpers performing the
 *         final store (OPNAME pixels<N>_c / _l2 / _l4 and
 *         OPNAME mpeg4_qpel<N>_{h,v}_lowpass)
 * RND     infix selecting the put<RND>... helpers that build the intermediate
 *         planes
 * N       block width and height (8 or 16)
 * N1      N+1: number of rows/columns the lowpass filters read; also selects
 *         copy_block<N1>
 * FS      row stride of the local "full" copy of the source
 *         (16 for N=8, 24 for N=16)
 *
 * The two digits in mcXY select the horizontal (X) and vertical (Y) phase:
 * mc20 is the horizontal lowpass alone, mc02 the vertical lowpass alone, and
 * the other entries blend filtered planes with each other or with the
 * unfiltered source.
 *
 * Local buffers:
 *   full    copy of the source area made by copy_block<N1> (presumably N1 rows
 *           of N1 pixels -- confirm against copy_block<N1>), stride FS
 *   half    N x N output of a single lowpass
 *   halfH   N1 rows of N pixels: horizontal lowpass, one extra row for the
 *           vertical pass that follows
 *   halfV   N x N vertical lowpass of the source
 *   halfHV  N x N vertical lowpass of halfH
 *
 * The ff_..._old_c variants are not static.
 */
#define QPEL_MC_SIZE(OPNAME, RND, N, N1, FS) \
static void OPNAME ## qpel ## N ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## N ## _c(dst, src, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(half, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src, half, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel ## N ## _h_lowpass(dst, src, stride, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(half, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src+1, half, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t half[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(half, full, N, FS);\
    OPNAME ## pixels ## N ## _l2(dst, full, half, stride, FS, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    copy_block ## N1(full, src, FS, stride, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, full, stride, FS);\
}\
\
static void OPNAME ## qpel ## N ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t half[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(half, full, N, FS);\
    OPNAME ## pixels ## N ## _l2(dst, full+FS, half, stride, FS, N, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full, halfH, halfV, halfHV, stride, FS, N, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full, N, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH, halfHV, stride, N, N, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+1, halfH, halfV, halfHV, stride, FS, N, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full+1, N, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH, halfHV, stride, N, N, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+FS, halfH+N, halfV, halfHV, stride, FS, N, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full, N, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH+N, halfHV, stride, N, N, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full  , N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, full+FS+1, halfH+N, halfV, halfHV, stride, FS, N, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full+1, N, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH+N, halfHV, stride, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH, halfHV, stride, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[N*N1];\
    uint8_t halfHV[N*N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfH+N, halfHV, stride, N, N, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfV, halfHV, stride, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full, N, N, FS, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, halfH, stride, N);\
}\
void ff_ ## OPNAME ## qpel ## N ## _mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    uint8_t halfV[N*N];\
    uint8_t halfHV[N*N];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfV, full+1, N, FS);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(halfHV, halfH, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, halfV, halfHV, stride, N, N, N);\
}\
static void OPNAME ## qpel ## N ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*N1];\
    uint8_t halfH[N*N1];\
    copy_block ## N1(full, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, full, N, FS, N1);\
    put ## RND ## pixels ## N ## _l2(halfH, halfH, full+1, N, N, FS, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, halfH, stride, N);\
}\
static void OPNAME ## qpel ## N ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[N*N1];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(halfH, src, N, stride, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, halfH, stride, N);\
}

/**
 * Generates one family of MPEG-4 quarter-pel motion compensation functions.
 *
 * r       not referenced by the expansion; kept so existing invocations
 *         continue to work
 * OPNAME  name prefix of the generated functions (e.g. put_, avg_)
 * RND     infix of the put* helpers used for intermediate planes
 * OP      store operator, invoked as OP(destination, filter_sum); it may refer
 *         to the local table pointer cm declared in each lowpass function
 *
 * The expansion consists of:
 *  - OPNAME mpeg4_qpel{8,16}_h_lowpass: filters h rows of 8 (16) pixels
 *    horizontally, reading src[0..8] (src[0..16]) of each row;
 *  - OPNAME mpeg4_qpel{8,16}_v_lowpass: filters 8 (16) columns vertically,
 *    reading 9 (17) rows and writing 8 (16);
 *  - the qpel8 and qpel16 mcXY functions produced by QPEL_MC_SIZE.
 *
 * The lowpass filter has taps (-1, 3, -6, 20, 20, -6, 3, -1), written below as
 * four symmetric pairs. Near the block borders the outer taps reuse samples
 * from inside the 0..8 (0..16) range, reflected at the edge, so nothing
 * outside that range is read. OP receives the raw tap sum; scaling and
 * clipping are left to OP.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
QPEL_MC_SIZE(OPNAME, RND,  8,  9, 16)\
QPEL_MC_SIZE(OPNAME, RND, 16, 17, 24)
2187

    
2188
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190
#define op_put(a, b) a = cm[((b) + 16)>>5]
2191
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2192

    
2193
QPEL_MC(0, put_       , _       , op_put)
2194
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195
QPEL_MC(0, avg_       , _       , op_avg)
2196
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2197
#undef op_avg
2198
#undef op_avg_no_rnd
2199
#undef op_put
2200
#undef op_put_no_rnd
2201

    
2202
#if 1
2203
#define H264_LOWPASS(OPNAME, OP, OP2) \
2204
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205
    const int h=2;\
2206
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207
    int i;\
2208
    for(i=0; i<h; i++)\
2209
    {\
2210
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2212
        dst+=dstStride;\
2213
        src+=srcStride;\
2214
    }\
2215
}\
2216
\
2217
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2218
    const int w=2;\
2219
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220
    int i;\
2221
    for(i=0; i<w; i++)\
2222
    {\
2223
        const int srcB= src[-2*srcStride];\
2224
        const int srcA= src[-1*srcStride];\
2225
        const int src0= src[0 *srcStride];\
2226
        const int src1= src[1 *srcStride];\
2227
        const int src2= src[2 *srcStride];\
2228
        const int src3= src[3 *srcStride];\
2229
        const int src4= src[4 *srcStride];\
2230
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2232
        dst++;\
2233
        src++;\
2234
    }\
2235
}\
2236
\
2237
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2238
    const int h=2;\
2239
    const int w=2;\
2240
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2241
    int i;\
2242
    src -= 2*srcStride;\
2243
    for(i=0; i<h+5; i++)\
2244
    {\
2245
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2247
        tmp+=tmpStride;\
2248
        src+=srcStride;\
2249
    }\
2250
    tmp -= tmpStride*(h+5-2);\
2251
    for(i=0; i<w; i++)\
2252
    {\
2253
        const int tmpB= tmp[-2*tmpStride];\
2254
        const int tmpA= tmp[-1*tmpStride];\
2255
        const int tmp0= tmp[0 *tmpStride];\
2256
        const int tmp1= tmp[1 *tmpStride];\
2257
        const int tmp2= tmp[2 *tmpStride];\
2258
        const int tmp3= tmp[3 *tmpStride];\
2259
        const int tmp4= tmp[4 *tmpStride];\
2260
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2262
        dst++;\
2263
        tmp++;\
2264
    }\
2265
}\
2266
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2267
    const int h=4;\
2268
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2269
    int i;\
2270
    for(i=0; i<h; i++)\
2271
    {\
2272
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2276
        dst+=dstStride;\
2277
        src+=srcStride;\
2278
    }\
2279
}\
2280
\
2281
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282
    const int w=4;\
2283
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284
    int i;\
2285
    for(i=0; i<w; i++)\
2286
    {\
2287
        const int srcB= src[-2*srcStride];\
2288
        const int srcA= src[-1*srcStride];\
2289
        const int src0= src[0 *srcStride];\
2290
        const int src1= src[1 *srcStride];\
2291
        const int src2= src[2 *srcStride];\
2292
        const int src3= src[3 *srcStride];\
2293
        const int src4= src[4 *srcStride];\
2294
        const int src5= src[5 *srcStride];\
2295
        const int src6= src[6 *srcStride];\
2296
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2300
        dst++;\
2301
        src++;\
2302
    }\
2303
}\
2304
\
2305
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2306
    const int h=4;\
2307
    const int w=4;\
2308
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309
    int i;\
2310
    src -= 2*srcStride;\
2311
    for(i=0; i<h+5; i++)\
2312
    {\
2313
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2317
        tmp+=tmpStride;\
2318
        src+=srcStride;\
2319
    }\
2320
    tmp -= tmpStride*(h+5-2);\
2321
    for(i=0; i<w; i++)\
2322
    {\
2323
        const int tmpB= tmp[-2*tmpStride];\
2324
        const int tmpA= tmp[-1*tmpStride];\
2325
        const int tmp0= tmp[0 *tmpStride];\
2326
        const int tmp1= tmp[1 *tmpStride];\
2327
        const int tmp2= tmp[2 *tmpStride];\
2328
        const int tmp3= tmp[3 *tmpStride];\
2329
        const int tmp4= tmp[4 *tmpStride];\
2330
        const int tmp5= tmp[5 *tmpStride];\
2331
        const int tmp6= tmp[6 *tmpStride];\
2332
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2336
        dst++;\
2337
        tmp++;\
2338
    }\
2339
}\
2340
\
2341
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342
    const int h=8;\
2343
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2344
    int i;\
2345
    for(i=0; i<h; i++)\
2346
    {\
2347
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2355
        dst+=dstStride;\
2356
        src+=srcStride;\
2357
    }\
2358
}\
2359
\
2360
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2361
    const int w=8;\
2362
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2363
    int i;\
2364
    for(i=0; i<w; i++)\
2365
    {\
2366
        const int srcB= src[-2*srcStride];\
2367
        const int srcA= src[-1*srcStride];\
2368
        const int src0= src[0 *srcStride];\
2369
        const int src1= src[1 *srcStride];\
2370
        const int src2= src[2 *srcStride];\
2371
        const int src3= src[3 *srcStride];\
2372
        const int src4= src[4 *srcStride];\
2373
        const int src5= src[5 *srcStride];\
2374
        const int src6= src[6 *srcStride];\
2375
        const int src7= src[7 *srcStride];\
2376
        const int src8= src[8 *srcStride];\
2377
        const int src9= src[9 *srcStride];\
2378
        const int src10=src[10*srcStride];\
2379
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2387
        dst++;\
2388
        src++;\
2389
    }\
2390
}\
2391
\
2392
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2393
    const int h=8;\
2394
    const int w=8;\
2395
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2396
    int i;\
2397
    src -= 2*srcStride;\
2398
    for(i=0; i<h+5; i++)\
2399
    {\
2400
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2408
        tmp+=tmpStride;\
2409
        src+=srcStride;\
2410
    }\
2411
    tmp -= tmpStride*(h+5-2);\
2412
    for(i=0; i<w; i++)\
2413
    {\
2414
        const int tmpB= tmp[-2*tmpStride];\
2415
        const int tmpA= tmp[-1*tmpStride];\
2416
        const int tmp0= tmp[0 *tmpStride];\
2417
        const int tmp1= tmp[1 *tmpStride];\
2418
        const int tmp2= tmp[2 *tmpStride];\
2419
        const int tmp3= tmp[3 *tmpStride];\
2420
        const int tmp4= tmp[4 *tmpStride];\
2421
        const int tmp5= tmp[5 *tmpStride];\
2422
        const int tmp6= tmp[6 *tmpStride];\
2423
        const int tmp7= tmp[7 *tmpStride];\
2424
        const int tmp8= tmp[8 *tmpStride];\
2425
        const int tmp9= tmp[9 *tmpStride];\
2426
        const int tmp10=tmp[10*tmpStride];\
2427
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2435
        dst++;\
2436
        tmp++;\
2437
    }\
2438
}\
2439
\
2440
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2442
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443
    src += 8*srcStride;\
2444
    dst += 8*dstStride;\
2445
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2446
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2447
}\
2448
\
2449
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2451
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452
    src += 8*srcStride;\
2453
    dst += 8*dstStride;\
2454
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2455
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2456
}\
2457
\
2458
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2460
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461
    src += 8*srcStride;\
2462
    dst += 8*dstStride;\
2463
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2464
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2465
}\
2466

    
2467
/*
 * Generates the sixteen OPNAME##h264_qpel##SIZE##_mcXY_c entry points
 * (X, Y in 0..3) on top of the h/v/hv lowpass helpers.
 * Intermediate planes are always produced with the put_ helpers; OPNAME
 * only selects how the final result is stored into dst.
 * The pixelsN_l2 helpers are defined elsewhere; they receive two source
 * planes and, presumably, store their average.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    /* no filtering: plain block operation on the source */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    /* horizontally filtered plane combined with the source itself */\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    /* horizontally filtered plane written straight to dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    /* as mc10, but combined with the source one pixel to the right */\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    /* SIZE+5 source rows (starting two rows above src) packed with stride SIZE */\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t vbuf[SIZE*SIZE];\
\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, cols_mid, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
\
    /* vertically filtered plane written straight to dst */\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, cols_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* as mc01, but combined with the source one row further down */\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, cols_mid+SIZE, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* vertical plane from the current column, horizontal plane from the current row */\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* vertical plane taken one pixel to the right */\
    copy_block ## SIZE (cols, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* horizontal plane taken one row further down */\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
\
    /* horizontal plane one row down, vertical plane one pixel to the right */\
    copy_block ## SIZE (cols, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    /* 16-bit scratch for the horizontal pass of the 2D filter */\
    int16_t mid16[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid16, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    /* 2D-filtered plane combined with the horizontal plane of the current row */\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    /* as mc21, with the horizontal plane taken one row further down */\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    /* 2D-filtered plane combined with the vertical plane of the current column */\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (cols, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t cols[SIZE*(SIZE+5)];\
    uint8_t * const cols_mid= cols + SIZE*2;\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    /* as mc12, with the vertical plane taken one pixel to the right */\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (cols, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, cols_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}
2603

    
2604
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2607
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2609

    
2610
H264_LOWPASS(put_       , op_put, op2_put)
2611
H264_LOWPASS(avg_       , op_avg, op2_avg)
2612
H264_MC(put_, 2)
2613
H264_MC(put_, 4)
2614
H264_MC(put_, 8)
2615
H264_MC(put_, 16)
2616
H264_MC(avg_, 4)
2617
H264_MC(avg_, 8)
2618
H264_MC(avg_, 16)
2619

    
2620
#undef op_avg
2621
#undef op_put
2622
#undef op2_avg
2623
#undef op2_put
2624
#endif
2625

    
2626
/*
 * H.264 explicit weighted prediction, C reference.
 *
 * op_scale1(x): in-place unidirectional weighting of block[x].
 * op_scale2(x): bidirectional weighting of src[x] and dst[x] into dst[x].
 * Both clip the result with av_clip_uint8().
 *
 * H264_WEIGHT(W,H) emits weight_h264_pixelsWxH_c and
 * biweight_h264_pixelsWxH_c for one block size; W must be 2, 4, 8 or 16
 * (the row body is unrolled and cut short with "continue" at W).
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* offset may be negative; left-shifting a negative int is undefined, \
     * so scale it in the unsigned domain */ \
    offset = (int)((unsigned)offset << log2_denom); \
    /* rounding term for the final >> log2_denom */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* same undefined-shift hazard as above: (offset + 1) | 1 can be negative */ \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2695

    
2696
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698
    int i;
2699

    
2700
    for(i=0; i<h; i++){
2701
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709
        dst+=dstStride;
2710
        src+=srcStride;
2711
    }
2712
}
2713

    
2714
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for CAVS: each one forwards to the generic
 * put/avg pixel routine of the matching width, for 8 or 16 rows.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2731

    
2732
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry point for VC-1: forwards to put_pixels8_c for 8 rows.
 * The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2740

    
2741
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2742

    
2743
/* H264 specific */
2744
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2745

    
2746
#if CONFIG_RV30_DECODER
2747
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2748
#endif /* CONFIG_RV30_DECODER */
2749

    
2750
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points: each one forwards to the generic xy2
 * put/avg pixel routine of the matching width, for 16 or 8 rows.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2766

    
2767
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2768
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2769
    int i;
2770

    
2771
    for(i=0; i<w; i++){
2772
        const int src_1= src[ -srcStride];
2773
        const int src0 = src[0          ];
2774
        const int src1 = src[  srcStride];
2775
        const int src2 = src[2*srcStride];
2776
        const int src3 = src[3*srcStride];
2777
        const int src4 = src[4*srcStride];
2778
        const int src5 = src[5*srcStride];
2779
        const int src6 = src[6*srcStride];
2780
        const int src7 = src[7*srcStride];
2781
        const int src8 = src[8*srcStride];
2782
        const int src9 = src[9*srcStride];
2783
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2784
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2785
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2786
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2787
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2788
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2789
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2790
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2791
        src++;
2792
        dst++;
2793
    }
2794
}
2795

    
2796
/** mspel mc00: forwards to put_pixels8_c for 8 rows, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2799

    
2800
/** mspel mc10: horizontally filtered 8x8 plane combined with src via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2805

    
2806
/** mspel mc20: horizontally filtered 8 rows written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2809

    
2810
/** mspel mc30: as mc10, but combined with the source one pixel to the right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2815

    
2816
/** mspel mc02: vertically filtered 8 columns written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2819

    
2820
/**
 * mspel mc12: combines the vertically filtered source with the
 * horizontally-then-vertically filtered source via put_pixels8_l2.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t vfilt[8*8];
    uint8_t hrows[8*11]; /* 11 rows, starting one row above src */
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps stay inside hrows */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);

    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
2829
/**
 * mspel mc32: as mc12, but the purely vertical plane is taken one
 * pixel to the right of src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t vfilt[8*8];
    uint8_t hrows[8*11]; /* 11 rows, starting one row above src */
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps stay inside hrows */
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);

    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
2838
/** mspel mc22: horizontal pass into a scratch buffer, vertical pass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[8*11]; /* 11 rows, starting one row above src */

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical taps stay inside hrows */
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2843

    
2844
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2845
    if(CONFIG_ANY_H263) {
2846
    int x;
2847
    const int strength= ff_h263_loop_filter_strength[qscale];
2848

    
2849
    for(x=0; x<8; x++){
2850
        int d1, d2, ad1;
2851
        int p0= src[x-2*stride];
2852
        int p1= src[x-1*stride];
2853
        int p2= src[x+0*stride];
2854
        int p3= src[x+1*stride];
2855
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2856

    
2857
        if     (d<-2*strength) d1= 0;
2858
        else if(d<-  strength) d1=-2*strength - d;
2859
        else if(d<   strength) d1= d;
2860
        else if(d< 2*strength) d1= 2*strength - d;
2861
        else                   d1= 0;
2862

    
2863
        p1 += d1;
2864
        p2 -= d1;
2865
        if(p1&256) p1= ~(p1>>31);
2866
        if(p2&256) p2= ~(p2>>31);
2867

    
2868
        src[x-1*stride] = p1;
2869
        src[x+0*stride] = p2;
2870

    
2871
        ad1= FFABS(d1)>>1;
2872

    
2873
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2874

    
2875
        src[x-2*stride] = p0 - d2;
2876
        src[x+  stride] = p3 + d2;
2877
    }
2878
    }
2879
}
2880

    
2881
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2882
    if(CONFIG_ANY_H263) {
2883
    int y;
2884
    const int strength= ff_h263_loop_filter_strength[qscale];
2885

    
2886
    for(y=0; y<8; y++){
2887
        int d1, d2, ad1;
2888
        int p0= src[y*stride-2];
2889
        int p1= src[y*stride-1];
2890
        int p2= src[y*stride+0];
2891
        int p3= src[y*stride+1];
2892
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2893

    
2894
        if     (d<-2*strength) d1= 0;
2895
        else if(d<-  strength) d1=-2*strength - d;
2896
        else if(d<   strength) d1= d;
2897
        else if(d< 2*strength) d1= 2*strength - d;
2898
        else                   d1= 0;
2899

    
2900
        p1 += d1;
2901
        p2 -= d1;
2902
        if(p1&256) p1= ~(p1>>31);
2903
        if(p2&256) p2= ~(p2>>31);
2904

    
2905
        src[y*stride-1] = p1;
2906
        src[y*stride+0] = p2;
2907

    
2908
        ad1= FFABS(d1)>>1;
2909

    
2910
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2911

    
2912
        src[y*stride-2] = p0 - d2;
2913
        src[y*stride+1] = p3 + d2;
2914
    }
2915
    }
2916
}
2917

    
2918
/**
 * H.261 loop filter on one 8x8 block, in place.
 * A separable (1, 2, 1) filter is applied vertically, then horizontally.
 * The first/last row skip the vertical pass and the first/last column
 * skip the horizontal pass (they are only rescaled).
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8]; /* vertical pass output, scaled by 4 */
    int row, col;

    /* border rows: no vertical neighbours are used, just match the scale */
    for(col=0; col<8; col++){
        vert[0][col] = 4*src[col];
        vert[7][col] = 4*src[col + 7*stride];
    }

    for(row=1; row<7; row++){
        const uint8_t *cur = src + row*stride;
        for(col=0; col<8; col++)
            vert[row][col] = cur[col - stride] + 2*cur[col] + cur[col + stride];
    }

    /* the whole block has been read by now, so src can be overwritten */
    for(row=0; row<8; row++){
        uint8_t *out = src + row*stride;
        const int *in = vert[row];

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2944

    
2945
/**
 * Deblock one luma edge with per-group clipping thresholds.
 *
 * The edge is processed as 4 groups of 4 positions. Group i is skipped
 * entirely when tc0[i] is negative. At each position three samples on
 * either side of the edge are read (p2 p1 p0 | q0 q1 q2) and filtered
 * only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between consecutive positions along the edge
 * @param alpha   threshold on the step across the edge
 * @param beta    threshold on the activity on each side
 * @param tc0     four clipping limits, one per group of 4 positions
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its 4 positions */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* limit for p0/q0; grows by one for each side whose p1/q1 is filtered */
                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* multiply instead of << 2: q0 - p0 can be negative and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2985
/**
 * Luma deblocking with samples across the edge one stride apart and
 * successive edge positions one byte apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride; /* step between p/q samples */
    const int along  = 1;      /* step between edge positions */

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2989
/**
 * Luma edge filter with samples across the edge in adjacent bytes
 * (xstride = 1) and lines along the edge one picture line apart (ystride = stride).
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2993

    
2994
/**
 * Luma deblocking of one 16-line edge without clipping values (intra variant).
 * A line is filtered only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta.
 * If additionally |p0-q0| < (alpha>>2)+2, each side on which |p2-p0| (resp.
 * |q2-q0|) < beta gets three samples rewritten; otherwise only p0 / q0 change.
 *
 * @param pix     first sample on the q side of the edge (q0 of line 0)
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on the inner sample differences
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 16; line++, pix += ystride) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];
        const int step = FFABS( p0 - q0 );

        if (step >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta)
            continue;

        if (step >= (( alpha >> 2 ) + 2 )) {
            /* large step across the edge: only p0', q0' */
            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
            pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            continue;
        }

        /* p side */
        if (FFABS( p2 - p0 ) < beta) {
            const int p3 = pix[-4*xstride];
            /* p0', p1', p2' */
            pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
            pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
            pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
        } else {
            /* p0' */
            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
        }

        /* q side */
        if (FFABS( q2 - q0 ) < beta) {
            const int q3 = pix[3*xstride];
            /* q0', q1', q2' */
            pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
            pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
            pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
        } else {
            /* q0' */
            pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
        }
    }
}
3042
/**
 * Intra luma edge filter with samples across the edge one picture line apart
 * (xstride = stride) and lines along the edge in adjacent bytes (ystride = 1).
 */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_luma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
3046
/**
 * Intra luma edge filter with samples across the edge in adjacent bytes
 * (xstride = 1) and lines along the edge one picture line apart (ystride = stride).
 */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
3050

    
3051
/**
 * Chroma deblocking of one 8-line edge with per-group clipping.
 * The edge is processed as four groups of two lines; group i is clipped
 * with tc0[i], and tc0[i] <= 0 leaves that group untouched.
 * Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge (q0 of line 0)
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on |p1-p0| and |q1-q0|
 * @param tc0     four clipping values, one per group of two lines
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int group, line;

    for (group = 0; group < 4; group++) {
        const int tc = tc0[group];

        if (tc <= 0) {
            /* nothing to do for these two lines */
            pix += 2*ystride;
            continue;
        }

        for (line = 0; line < 2; line++, pix += ystride) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if (FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta)
                continue;

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
3079
/**
 * Chroma edge filter with samples across the edge one picture line apart
 * (xstride = stride) and lines along the edge in adjacent bytes (ystride = 1).
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
3083
/**
 * Chroma edge filter with samples across the edge in adjacent bytes
 * (xstride = 1) and lines along the edge one picture line apart (ystride = stride).
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
3087

    
3088
/**
 * Chroma deblocking of one 8-line edge without clipping values (intra variant).
 * On every line that passes the alpha/beta tests, p0 and q0 are replaced by
 * a rounded 3-tap average of their neighbourhood.
 *
 * @param pix     first sample on the q side of the edge (q0 of line 0)
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive lines along the edge
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on |p1-p0| and |q1-q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 8; line++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta)
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
3107
/**
 * Intra chroma edge filter with samples across the edge one picture line apart
 * (xstride = stride) and lines along the edge in adjacent bytes (ystride = 1).
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
3111
/**
 * Intra chroma edge filter with samples across the edge in adjacent bytes
 * (xstride = 1) and lines along the edge one picture line apart (ystride = stride).
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
3115

    
3116
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows
 * @return sum over all rows and 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3143

    
3144
/**
 * Sum of absolute differences, 16 samples wide, where each pix2 sample is
 * first combined with its right-hand neighbour through avg2().
 * Reads 17 samples per row of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is horizontally averaged
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3171

    
3172
/**
 * Sum of absolute differences, 16 samples wide, where each pix2 sample is
 * first combined with the sample one row below it through avg2().
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is vertically averaged
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3201

    
3202
/**
 * Sum of absolute differences, 16 samples wide, where each pix2 sample is
 * first combined with its right, lower and lower-right neighbours through
 * avg4(). Reads 17 samples per row and h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is averaged in both directions
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3231

    
3232
/**
 * Sum of absolute differences between two blocks 8 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows
 * @return sum over all rows and 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3251

    
3252
/**
 * Sum of absolute differences, 8 samples wide, where each pix2 sample is
 * first combined with its right-hand neighbour through avg2().
 * Reads 9 samples per row of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is horizontally averaged
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3271

    
3272
/**
 * Sum of absolute differences, 8 samples wide, where each pix2 sample is
 * first combined with the sample one row below it through avg2().
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is vertically averaged
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3293

    
3294
/**
 * Sum of absolute differences, 8 samples wide, where each pix2 sample is
 * first combined with its right, lower and lower-right neighbours through
 * avg4(). Reads 9 samples per row and h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is averaged in both directions
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3315

    
3316
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3317
    MpegEncContext *c = v;
3318
    int score1=0;
3319
    int score2=0;
3320
    int x,y;
3321

    
3322
    for(y=0; y<h; y++){
3323
        for(x=0; x<16; x++){
3324
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3325
        }
3326
        if(y+1<h){
3327
            for(x=0; x<15; x++){
3328
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3329
                             - s1[x+1] + s1[x+1+stride])
3330
                        -FFABS(  s2[x  ] - s2[x  +stride]
3331
                             - s2[x+1] + s2[x+1+stride]);
3332
            }
3333
        }
3334
        s1+= stride;
3335
        s2+= stride;
3336
    }
3337

    
3338
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3339
    else  return score1 + FFABS(score2)*8;
3340
}
3341

    
3342
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3343
    MpegEncContext *c = v;
3344
    int score1=0;
3345
    int score2=0;
3346
    int x,y;
3347

    
3348
    for(y=0; y<h; y++){
3349
        for(x=0; x<8; x++){
3350
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3351
        }
3352
        if(y+1<h){
3353
            for(x=0; x<7; x++){
3354
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3355
                             - s1[x+1] + s1[x+1+stride])
3356
                        -FFABS(  s2[x  ] - s2[x  +stride]
3357
                             - s2[x+1] + s2[x+1+stride]);
3358
            }
3359
        }
3360
        s1+= stride;
3361
        s2+= stride;
3362
    }
3363

    
3364
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3365
    else  return score1 + FFABS(score2)*8;
3366
}
3367

    
3368
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3369
    int i;
3370
    unsigned int sum=0;
3371

    
3372
    for(i=0; i<8*8; i++){
3373
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3374
        int w= weight[i];
3375
        b>>= RECON_SHIFT;
3376
        assert(-512<b && b<512);
3377

    
3378
        sum += (w*b)*(w*b)>>4;
3379
    }
3380
    return sum>>2;
3381
}
3382

    
3383
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3384
    int i;
3385

    
3386
    for(i=0; i<8*8; i++){
3387
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3388
    }
3389
}
3390

    
3391
/**
3392
 * permutes an 8x8 block.
3393
 * @param block the block which will be permuted according to the given permutation vector
3394
 * @param permutation the permutation vector
3395
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3396
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3397
 *                  (inverse) permutated to scantable order!
3398
 */
3399
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3400
{
3401
    int i;
3402
    DCTELEM temp[64];
3403

    
3404
    if(last<=0) return;
3405
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3406

    
3407
    for(i=0; i<=last; i++){
3408
        const int j= scantable[i];
3409
        temp[j]= block[j];
3410
        block[j]=0;
3411
    }
3412

    
3413
    for(i=0; i<=last; i++){
3414
        const int j= scantable[i];
3415
        const int perm_j= permutation[j];
3416
        block[perm_j]= temp[j];
3417
    }
3418
}
3419

    
3420
/**
 * Comparison function that ignores all of its arguments and always returns 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3423

    
3424
/**
 * Fill a table of five comparison functions according to a comparison type.
 *
 * @param c    DSP context whose function tables are copied from
 * @param cmp  destination table; must have room for 5 entries
 * @param type FF_CMP_* selector; only the low 8 bits are examined
 *
 * If the selector is not recognized, an error is logged once and all five
 * entries of cmp are left NULL.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* size by the element type: cmp holds function pointers, not void* */
    memset(cmp, 0, sizeof(*cmp)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
            /* the selector does not depend on i: report once, cmp[] is already zeroed */
            return;
        }
    }
}
3483

    
3484
/**
 * Zero one block of 64 coefficients.
 */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64*sizeof(*block));
}
3488

    
3489
/**
3490
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3491
 */
3492
static void clear_blocks_c(DCTELEM *blocks)
3493
{
3494
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3495
}
3496

    
3497
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3498
    long i;
3499
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3500
        long a = *(long*)(src+i);
3501
        long b = *(long*)(dst+i);
3502
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3503
    }
3504
    for(; i<w; i++)
3505
        dst[i+0] += src[i+0];
3506
}
3507

    
3508
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3509
    long i;
3510
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3511
        long a = *(long*)(src1+i);
3512
        long b = *(long*)(src2+i);
3513
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3514
    }
3515
    for(; i<w; i++)
3516
        dst[i] = src1[i]+src2[i];
3517
}
3518

    
3519
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3520
    long i;
3521
#if !HAVE_FAST_UNALIGNED
3522
    if((long)src2 & (sizeof(long)-1)){
3523
        for(i=0; i+7<w; i+=8){
3524
            dst[i+0] = src1[i+0]-src2[i+0];
3525
            dst[i+1] = src1[i+1]-src2[i+1];
3526
            dst[i+2] = src1[i+2]-src2[i+2];
3527
            dst[i+3] = src1[i+3]-src2[i+3];
3528
            dst[i+4] = src1[i+4]-src2[i+4];
3529
            dst[i+5] = src1[i+5]-src2[i+5];
3530
            dst[i+6] = src1[i+6]-src2[i+6];
3531
            dst[i+7] = src1[i+7]-src2[i+7];
3532
        }
3533
    }else
3534
#endif
3535
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3536
        long a = *(long*)(src1+i);
3537
        long b = *(long*)(src2+i);
3538
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3539
    }
3540
    for(; i<w; i++)
3541
        dst[i+0] = src1[i+0]-src2[i+0];
3542
}
3543

    
3544
/**
 * Write to dst the residual of src2 after median prediction.
 * For each position the predictor is mid_pred() of the left sample, the
 * sample at the same position in src1, and (left + src1[i] - top-left) & 0xFF.
 *
 * @param dst      residual output, w bytes
 * @param src1     row supplying the second predictor input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: left neighbour of src2[0]; out: last sample of src2
 * @param left_top in: left neighbour of src1[0]; out: last sample of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    /* both running neighbours are kept as bytes, as the incoming ints are truncated */
    uint8_t cur_left    = *left;
    uint8_t cur_topleft = *left_top;
    int i;

    for(i=0; i<w; i++){
        const int top      = src1[i];
        const int gradient = (cur_left + top - cur_topleft)&0xFF;
        const int pred     = mid_pred(cur_left, top, gradient);

        cur_topleft = top;
        cur_left    = src2[i];
        dst[i]      = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_topleft;
}
3561

    
3562
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2(o1,o2,i1,i2): o1 = i1+i2, o2 = i1-i2. The inputs are evaluated
 *     twice, so they must be free of side effects. The expansion is one
 *     braced block, so it stays a single statement under an unbraced
 *     if/for/while.
 * BUTTERFLY1(x,y): in-place variant, (x,y) becomes (x+y, x-y).
 * BUTTERFLYA(x,y): |x+y| + |x-y| as an expression; nothing is stored.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}

#define BUTTERFLY1(x,y) \
{\
    const int bfly1_a= (x);\
    const int bfly1_b= (y);\
    (x)= bfly1_a+bfly1_b;\
    (y)= bfly1_a-bfly1_b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3576

    
3577
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-type) transform of
 * the difference src - dst.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block
 * @param stride distance in bytes between consecutive rows of both blocks
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* pass 1: three butterfly stages along each row of the difference */
    for(i=0; i<8; i++){
        int *row= temp + 8*i;
        const uint8_t *src_row= src + stride*i;
        const uint8_t *dst_row= dst + stride*i;

        BUTTERFLY2(row[0], row[1], src_row[0]-dst_row[0], src_row[1]-dst_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2]-dst_row[2], src_row[3]-dst_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4]-dst_row[4], src_row[5]-dst_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6]-dst_row[6], src_row[7]-dst_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* pass 2: the same along each column; the last stage is folded into the
     * absolute sum, so its outputs are never stored */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3628

    
3629
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-type) transform of
 * src itself, with the magnitude of the first (mean) coefficient removed.
 *
 * @param s      unused
 * @param src    block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* pass 1: three butterfly stages along each row */
    for(i=0; i<8; i++){
        int *row= temp + 8*i;
        const uint8_t *src_row= src + stride*i;

        BUTTERFLY2(row[0], row[1], src_row[0], src_row[1]);
        BUTTERFLY2(row[2], row[3], src_row[2], src_row[3]);
        BUTTERFLY2(row[4], row[5], src_row[4], src_row[5]);
        BUTTERFLY2(row[6], row[7], src_row[6], src_row[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* pass 2: the same along each column; the last stage is folded into the
     * absolute sum, so its outputs are never stored */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* temp[0] + temp[8*4] is the first output of column 0's final stage */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3676

    
3677
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3678
    MpegEncContext * const s= (MpegEncContext *)c;
3679
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3680
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3681

    
3682
    assert(h==8);
3683

    
3684
    s->dsp.diff_pixels(temp, src1, src2, stride);
3685
    s->dsp.fdct(temp);
3686
    return s->dsp.sum_abs_dctelem(temp);
3687
}
3688

    
3689
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built from adds, subtracts and shifts.
 * Inputs are read through SRC(n) and outputs emitted through DST(n, value);
 * both macros must be defined at the point of use. All inputs are read
 * before the first output is emitted.
 */
#define DCT8_1D {\
    const int sum07 = SRC(0) + SRC(7);\
    const int sum16 = SRC(1) + SRC(6);\
    const int sum25 = SRC(2) + SRC(5);\
    const int sum34 = SRC(3) + SRC(4);\
    const int even0 = sum07 + sum34;\
    const int even1 = sum16 + sum25;\
    const int even2 = sum07 - sum34;\
    const int even3 = sum16 - sum25;\
    const int dif07 = SRC(0) - SRC(7);\
    const int dif16 = SRC(1) - SRC(6);\
    const int dif25 = SRC(2) - SRC(5);\
    const int dif34 = SRC(3) - SRC(4);\
    const int odd0 = dif16 + dif25 + (dif07 + (dif07>>1));\
    const int odd1 = dif07 - dif34 - (dif25 + (dif25>>1));\
    const int odd2 = dif07 + dif34 - (dif16 + (dif16>>1));\
    const int odd3 = dif16 - dif25 + (dif34 + (dif34>>1));\
    DST(0,  even0 + even1      ) ;\
    DST(1,  odd0 + (odd3>>2)   ) ;\
    DST(2,  even2 + (even3>>1) ) ;\
    DST(3,  odd1 + (odd2>>2)   ) ;\
    DST(4,  even0 - even1      ) ;\
    DST(5,  odd2 - (odd1>>2)   ) ;\
    DST(6, (even2>>1) - even3  ) ;\
    DST(7, (odd0>>2) - odd3    ) ;\
}

/**
 * Sum of absolute transform coefficients of the 8x8 difference of src1 and
 * src2, using the DCT8_1D integer transform in both directions.
 *
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc= (MpegEncContext *)c;
    DCTELEM coef[8][8];
    int sum=0;
    int i;

    enc->dsp.diff_pixels(coef[0], src1, src2, stride);

    /* pass 1: transform each row in place */
#define SRC(x) coef[i][x]
#define DST(x,v) coef[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* pass 2: transform each column; outputs go straight into the absolute sum */
#define SRC(x) coef[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3741

    
3742
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3743
    MpegEncContext * const s= (MpegEncContext *)c;
3744
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3745
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3746
    int sum=0, i;
3747

    
3748
    assert(h==8);
3749

    
3750
    s->dsp.diff_pixels(temp, src1, src2, stride);
3751
    s->dsp.fdct(temp);
3752

    
3753
    for(i=0; i<64; i++)
3754
        sum= FFMAX(sum, FFABS(temp[i]));
3755

    
3756
    return sum;
3757
}
3758

    
3759
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3760
    MpegEncContext * const s= (MpegEncContext *)c;
3761
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3762
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3763
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3764
    int sum=0, i;
3765

    
3766
    assert(h==8);
3767
    s->mb_intra=0;
3768

    
3769
    s->dsp.diff_pixels(temp, src1, src2, stride);
3770

    
3771
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3772

    
3773
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3774
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3775
    ff_simple_idct(temp); //FIXME
3776

    
3777
    for(i=0; i<64; i++)
3778
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3779

    
3780
    return sum;
3781
}
3782

    
3783
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3784
    MpegEncContext * const s= (MpegEncContext *)c;
3785
    const uint8_t *scantable= s->intra_scantable.permutated;
3786
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3787
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3788
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3789
    uint8_t * const bak= (uint8_t*)aligned_bak;
3790
    int i, last, run, bits, level, distortion, start_i;
3791
    const int esc_length= s->ac_esc_length;
3792
    uint8_t * length;
3793
    uint8_t * last_length;
3794

    
3795
    assert(h==8);
3796

    
3797
    for(i=0; i<8; i++){
3798
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3799
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3800
    }
3801

    
3802
    s->dsp.diff_pixels(temp, src1, src2, stride);
3803

    
3804
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3805

    
3806
    bits=0;
3807

    
3808
    if (s->mb_intra) {
3809
        start_i = 1;
3810
        length     = s->intra_ac_vlc_length;
3811
        last_length= s->intra_ac_vlc_last_length;
3812
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3813
    } else {
3814
        start_i = 0;
3815
        length     = s->inter_ac_vlc_length;
3816
        last_length= s->inter_ac_vlc_last_length;
3817
    }
3818

    
3819
    if(last>=start_i){
3820
        run=0;
3821
        for(i=start_i; i<last; i++){
3822
            int j= scantable[i];
3823
            level= temp[j];
3824

    
3825
            if(level){
3826
                level+=64;
3827
                if((level&(~127)) == 0){
3828
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3829
                }else
3830
                    bits+= esc_length;
3831
                run=0;
3832
            }else
3833
                run++;
3834
        }
3835
        i= scantable[last];
3836

    
3837
        level= temp[i] + 64;
3838

    
3839
        assert(level - 64);
3840

    
3841
        if((level&(~127)) == 0){
3842
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3843
        }else
3844
            bits+= esc_length;
3845

    
3846
    }
3847

    
3848
    if(last>=0){
3849
        if(s->mb_intra)
3850
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3851
        else
3852
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3853
    }
3854

    
3855
    s->dsp.idct_add(bak, stride, temp);
3856

    
3857
    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3858

    
3859
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3860
}
3861

    
3862
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3863
    MpegEncContext * const s= (MpegEncContext *)c;
3864
    const uint8_t *scantable= s->intra_scantable.permutated;
3865
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3866
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3867
    int i, last, run, bits, level, start_i;
3868
    const int esc_length= s->ac_esc_length;
3869
    uint8_t * length;
3870
    uint8_t * last_length;
3871

    
3872
    assert(h==8);
3873

    
3874
    s->dsp.diff_pixels(temp, src1, src2, stride);
3875

    
3876
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3877

    
3878
    bits=0;
3879

    
3880
    if (s->mb_intra) {
3881
        start_i = 1;
3882
        length     = s->intra_ac_vlc_length;
3883
        last_length= s->intra_ac_vlc_last_length;
3884
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3885
    } else {
3886
        start_i = 0;
3887
        length     = s->inter_ac_vlc_length;
3888
        last_length= s->inter_ac_vlc_last_length;
3889
    }
3890

    
3891
    if(last>=start_i){
3892
        run=0;
3893
        for(i=start_i; i<last; i++){
3894
            int j= scantable[i];
3895
            level= temp[j];
3896

    
3897
            if(level){
3898
                level+=64;
3899
                if((level&(~127)) == 0){
3900
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3901
                }else
3902
                    bits+= esc_length;
3903
                run=0;
3904
            }else
3905
                run++;
3906
        }
3907
        i= scantable[last];
3908

    
3909
        level= temp[i] + 64;
3910

    
3911
        assert(level - 64);
3912

    
3913
        if((level&(~127)) == 0){
3914
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3915
        }else
3916
            bits+= esc_length;
3917
    }
3918

    
3919
    return bits;
3920
}
3921

    
3922
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute vertical gradient
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(upper[col] - lower[col]);

        upper = lower;
    }

    return total;
}
3936

    
3937
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 *
 * For every pair of vertically adjacent rows, the per-pixel difference
 * (s1 - s2) of the upper row is compared with that of the lower row.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated absolute value
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *a = s1;
    const uint8_t *b = s2;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int upper = a[col]          - b[col];
            const int lower = a[col + stride] - b[col + stride];
            total += FFABS(upper - lower);
        }
        a += stride;
        b += stride;
    }

    return total;
}
3951

    
3952
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared vertical gradient
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    const uint8_t *upper = s;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *lower = upper + stride;

        for (col = 0; col < 16; col++)
            total += SQ(upper[col] - lower[col]);

        upper = lower;
    }

    return total;
}
3967

    
3968
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 * @return accumulated squared value
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    const uint8_t *a = s1;
    const uint8_t *b = s2;
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int upper = a[col]          - b[col];
            const int lower = a[col + stride] - b[col + stride];
            total += SQ(upper - lower);
        }
        a += stride;
        b += stride;
    }

    return total;
}
3982

    
3983
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first input, size elements
 * @param pix2 second input, size elements
 * @param size number of elements to compare
 * @return sum over all i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k = 0;

    while (k < size) {
        const int diff = pix1[k] - pix2[k];
        total += diff * diff;
        k++;
    }
    return total;
}
3991

    
3992
/* Instantiate the *16_c comparison functions from their 8x8 counterparts.
 * WRAPPER8_16_SQ is defined earlier in this file; presumably it builds the
 * second-named function by combining calls to the first over the 8x8
 * sub-blocks -- confirm against the macro definition.
 * The dct264 variant is only instantiated when CONFIG_GPL is set. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3993
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3994
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3995
#if CONFIG_GPL
3996
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3997
#endif
3998
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3999
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4000
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4001
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4002

    
4003
/**
 * Multiply dst element-wise by src, in place.
 *
 * @param dst in/out vector of len floats
 * @param src multiplier vector of len floats
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] = dst[k] * src[k];
        k++;
    }
}
4008

    
4009
/**
 * Multiply src0 element-wise by src1 read in reverse order.
 *
 * dst[i] = src0[i] * src1[len-1-i] for 0 <= i < len.
 *
 * @param dst  output vector of len floats
 * @param src0 first input, read front to back
 * @param src1 second input, read back to front
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int last = len - 1;
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[last - k];
}
4015

    
4016
/**
 * Multiply two vectors, add a third vector and an integer constant, and
 * store the result with a stride.
 *
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3 for 0 <= i < len.
 *
 * @param dst  output, written at every step-th element
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 vector addend
 * @param src3 constant addend
 * @param len  number of elements to process
 * @param step output stride in elements
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k = 0;

    while (k < len) {
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
        k++;
    }
}
4021

    
4022
/**
 * Windowed overlap of two len-sample inputs into 2*len output samples.
 *
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k] + add_bias
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m] + add_bias
 *
 * @param dst      output, 2*len floats
 * @param src0     first input, len floats, read front to back
 * @param src1     second input, len floats, read back to front
 * @param win      window, 2*len floats
 * @param add_bias constant added to every output sample
 * @param len      half the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int head, tail;

    /* head walks up from 0 while tail walks down from 2*len-1 */
    for (head = 0, tail = 2*len - 1; head < len; head++, tail--) {
        const float s0 = src0[head];
        const float s1 = src1[tail - len];
        const float wi = win[head];
        const float wj = win[tail];

        dst[head] = s0*wj - s1*wi + add_bias;
        dst[tail] = s0*wi + s1*wj + add_bias;
    }
}
4036

    
4037
/**
 * Convert integers to float and scale them by a constant.
 *
 * @param dst output vector of len floats
 * @param src input vector of len ints
 * @param mul scale factor applied to every element
 * @param len number of elements
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
4042

    
4043
/**
 * Convert one float to a 16-bit sample value by operating on its IEEE bit
 * pattern instead of doing a float-to-int conversion.
 *
 * The 32-bit pattern is read through a union rather than a pointer cast, so
 * the float object is not accessed through an incompatible lvalue type.
 * If any of bits 16..19 of the pattern is set, the value is replaced by the
 * sign fill of (0x43c0ffff - pattern); finally 0x8000 is subtracted. The
 * caller stores the result into an int16_t.
 *
 * NOTE(review): presumably the input is expected to be pre-biased so that
 * in-range samples have the bit pattern 0x43c0xxxx -- confirm against callers.
 *
 * @param src pointer to the float to convert
 * @return converted value; only meaningful after truncation to 16 bits
 */
static av_always_inline int float_to_int16_one(const float *src){
    union { float f; int32_t i; } bits;
    int_fast32_t tmp;

    bits.f = *src;
    tmp    = bits.i;
    if(tmp & 0xf0000){
        /* out of range: saturate via the sign of the difference */
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
4053

    
4054
/**
 * Convert an array of floats to int16_t using float_to_int16_one().
 *
 * The loop counter has the same type as len so that it cannot overflow
 * before reaching len on platforms where long is wider than int.
 *
 * @param dst output, len samples
 * @param src input, len floats
 * @param len number of samples to convert
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
4059

    
4060
/**
 * Convert planar float channels to interleaved int16_t samples.
 *
 * Sample i of channel c is written to dst[i*channels + c]. The stereo case
 * has a dedicated loop that handles both channels per iteration.
 *
 * Sample and output indices are long, matching len, so they cannot overflow
 * before the loop bound is reached on platforms where long is wider than int.
 *
 * @param dst      output, len*channels samples
 * @param src      array of channels pointers, each to len floats
 * @param len      number of samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j;
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
4073

    
4074
/**
 * Add v2 to v1 element-wise, in place.
 *
 * @param v1    in/out vector of order elements
 * @param v2    addend vector of order elements
 * @param order number of elements
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k = 0;

    while (order--) {
        v1[k] += v2[k];
        k++;
    }
}
4079

    
4080
/**
 * Subtract v2 from v1 element-wise, in place.
 *
 * @param v1    in/out vector of order elements
 * @param v2    subtrahend vector of order elements
 * @param order number of elements
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k = 0;

    while (order--) {
        v1[k] -= v2[k];
        k++;
    }
}
4085

    
4086
/**
 * Scalar product of two int16_t vectors, with each individual product
 * shifted right by shift before it is accumulated.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements
 * @param shift right shift applied to every product
 * @return sum of the shifted products
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order--; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
4095

    
4096
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional 8-point WMV2 inverse DCT over one row, in place.
 *
 * Inputs are weighted with the W0..W7 fixed-point constants and the outputs
 * are rounded and shifted right by 8.
 *
 * @param b the 8 consecutive coefficients of the row; overwritten with the result
 */
static void wmv2_idct_row(short * b)
{
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
    const int round = 1<<7;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    int s1,s2;
    int even03, even12, even21, even30;
    int odd0, odd3;

    /*step 1: rotations of the input pairs*/
    a1 = W1*x1+W7*x7;
    a7 = W7*x1-W1*x7;
    a5 = W5*x5+W3*x3;
    a3 = W3*x5-W5*x3;
    a2 = W2*x2+W6*x6;
    a6 = W6*x2-W2*x6;
    a0 = W0*x0+W0*x4;
    a4 = W0*x0-W0*x4;

    /*step 2: middle odd terms, scaled by 181/256*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /*step 3: butterfly of even and odd parts*/
    even03 = a0+a2;
    even30 = a0-a2;
    even12 = a4+a6;
    even21 = a4-a6;
    odd0   = a1+a5;
    odd3   = a7+a3;

    b[0] = (even03 + odd0 + round)>>8;
    b[1] = (even12 + s1   + round)>>8;
    b[2] = (even21 + s2   + round)>>8;
    b[3] = (even30 + odd3 + round)>>8;
    b[4] = (even30 - odd3 + round)>>8;
    b[5] = (even21 - s2   + round)>>8;
    b[6] = (even12 - s1   + round)>>8;
    b[7] = (even03 - odd0 + round)>>8;
}
4131
static void wmv2_idct_col(short * b)
4132
{
4133
    int s1,s2;
4134
    int a0,a1,a2,a3,a4,a5,a6,a7;
4135
    /*step 1, with extended precision*/
4136
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4137
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4138
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4139
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4140
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4141
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4142
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4143
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4144
    /*step 2*/
4145
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4146
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4147
    /*step 3*/
4148
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4149
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4150
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4151
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4152

    
4153
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4154
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4155
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4156
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4157
}
4158
/**
 * Two-dimensional 8x8 WMV2 inverse DCT, in place.
 *
 * Runs the row pass over all 8 rows, then the column pass over all
 * 8 columns.
 *
 * @param block 64 coefficients in row-major order; overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4168
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4169
 converted */
4170
/**
 * Inverse-transform block in place with ff_wmv2_idct_c(), then hand the
 * result to put_pixels_clamped_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     8x8 coefficient block; overwritten by the transform
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);                        /* transform in place */
    put_pixels_clamped_c(block, dest, line_size); /* then output to dest */
}
4175
/**
 * Inverse-transform block in place with ff_wmv2_idct_c(), then hand the
 * result to add_pixels_clamped_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     8x8 coefficient block; overwritten by the transform
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);                        /* transform in place */
    add_pixels_clamped_c(block, dest, line_size); /* then combine with dest */
}
4180
/**
 * Run j_rev_dct() on block, then hand the result to put_pixels_clamped_c()
 * for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct()
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);                             /* transform first */
    put_pixels_clamped_c(block, dest, line_size); /* then output to dest */
}
4185
/**
 * Run j_rev_dct() on block, then hand the result to add_pixels_clamped_c()
 * for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct()
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct(block);                             /* transform first */
    add_pixels_clamped_c(block, dest, line_size); /* then combine with dest */
}
4190

    
4191
/**
 * Run j_rev_dct4() on block, then hand the result to
 * put_pixels_clamped4_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct4()
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);                             /* transform first */
    put_pixels_clamped4_c(block, dest, line_size); /* then output to dest */
}
4196
/**
 * Run j_rev_dct4() on block, then hand the result to
 * add_pixels_clamped4_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct4()
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4(block);                             /* transform first */
    add_pixels_clamped4_c(block, dest, line_size); /* then combine with dest */
}
4201

    
4202
/**
 * Run j_rev_dct2() on block, then hand the result to
 * put_pixels_clamped2_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct2()
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);                             /* transform first */
    put_pixels_clamped2_c(block, dest, line_size); /* then output to dest */
}
4207
/**
 * Run j_rev_dct2() on block, then hand the result to
 * add_pixels_clamped2_c() for dest.
 *
 * @param dest      destination pixels
 * @param line_size line stride of dest
 * @param block     coefficient block passed to j_rev_dct2()
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2(block);                             /* transform first */
    add_pixels_clamped2_c(block, dest, line_size); /* then combine with dest */
}
4212

    
4213
/**
 * Single-pixel "IDCT": store the rounded DC coefficient divided by 8 into
 * dest[0], mapped through the ff_cropTbl lookup table.
 *
 * @param dest      destination pixel
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    const uint8_t *crop = ff_cropTbl + MAX_NEG_CROP;
    const int dc = (block[0] + 4)>>3;

    dest[0] = crop[dc];
}
4219
/**
 * Single-pixel "IDCT": add the rounded DC coefficient divided by 8 to
 * dest[0], mapping the sum through the ff_cropTbl lookup table.
 *
 * @param dest      destination pixel, updated in place
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    const uint8_t *crop = ff_cropTbl + MAX_NEG_CROP;
    const int dc = (block[0] + 4)>>3;

    dest[0] = crop[dest[0] + dc];
}
4225

    
4226
/**
 * No-op callback: ignores all of its arguments and does nothing.
 */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
}
4227

    
4228
/* init static data */
4229
void dsputil_static_init(void)
4230
{
4231
    int i;
4232

    
4233
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4234
    for(i=0;i<MAX_NEG_CROP;i++) {
4235
        ff_cropTbl[i] = 0;
4236
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4237
    }
4238

    
4239
    for(i=0;i<512;i++) {
4240
        ff_squareTbl[i] = (i - 256) * (i - 256</