Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 199436b9

History | View | Annotate | Download (164 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard.
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file dsputil.c
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "mathops.h"
36
#include "h263.h"
37
#include "snow.h"
38

    
39
/* snow.c */
40
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
41

    
42
/* vorbis.c */
43
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
44

    
45
/* ac3dec.c */
46
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
47

    
48
/* flacenc.c */
49
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
50

    
51
/* pngdec.c */
52
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
53

    
54
/* eaidct.c */
55
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
56

    
57
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
58
uint32_t ff_squareTbl[512] = {0, };
59

    
60
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
61
#define pb_7f (~0UL/255 * 0x7f)
62
#define pb_80 (~0UL/255 * 0x80)
63

    
64
const uint8_t ff_zigzag_direct[64] = {
65
    0,   1,  8, 16,  9,  2,  3, 10,
66
    17, 24, 32, 25, 18, 11,  4,  5,
67
    12, 19, 26, 33, 40, 48, 41, 34,
68
    27, 20, 13,  6,  7, 14, 21, 28,
69
    35, 42, 49, 56, 57, 50, 43, 36,
70
    29, 22, 15, 23, 30, 37, 44, 51,
71
    58, 59, 52, 45, 38, 31, 39, 46,
72
    53, 60, 61, 54, 47, 55, 62, 63
73
};
74

    
75
/* Specific zigzag scan for 248 idct. NOTE that unlike the
76
   specification, we interleave the fields */
77
const uint8_t ff_zigzag248_direct[64] = {
78
     0,  8,  1,  9, 16, 24,  2, 10,
79
    17, 25, 32, 40, 48, 56, 33, 41,
80
    18, 26,  3, 11,  4, 12, 19, 27,
81
    34, 42, 49, 57, 50, 58, 35, 43,
82
    20, 28,  5, 13,  6, 14, 21, 29,
83
    36, 44, 51, 59, 52, 60, 37, 45,
84
    22, 30,  7, 15, 23, 31, 38, 46,
85
    53, 61, 54, 62, 39, 47, 55, 63,
86
};
87

    
88
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
89
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
90

    
91
const uint8_t ff_alternate_horizontal_scan[64] = {
92
    0,  1,   2,  3,  8,  9, 16, 17,
93
    10, 11,  4,  5,  6,  7, 15, 14,
94
    13, 12, 19, 18, 24, 25, 32, 33,
95
    26, 27, 20, 21, 22, 23, 28, 29,
96
    30, 31, 34, 35, 40, 41, 48, 49,
97
    42, 43, 36, 37, 38, 39, 44, 45,
98
    46, 47, 50, 51, 56, 57, 58, 59,
99
    52, 53, 54, 55, 60, 61, 62, 63,
100
};
101

    
102
const uint8_t ff_alternate_vertical_scan[64] = {
103
    0,  8,  16, 24,  1,  9,  2, 10,
104
    17, 25, 32, 40, 48, 56, 57, 49,
105
    41, 33, 26, 18,  3, 11,  4, 12,
106
    19, 27, 34, 42, 50, 58, 35, 43,
107
    51, 59, 20, 28,  5, 13,  6, 14,
108
    21, 29, 36, 44, 52, 60, 37, 45,
109
    53, 61, 22, 30,  7, 15, 23, 31,
110
    38, 46, 54, 62, 39, 47, 55, 63,
111
};
112

    
113
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
114
const uint32_t ff_inverse[256]={
115
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
116
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
117
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
118
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
119
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
120
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
121
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
122
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
123
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
124
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
125
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
126
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
127
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
128
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
129
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
130
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
131
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
132
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
133
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
134
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
135
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
136
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
137
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
138
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
139
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
140
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
141
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
142
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
143
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
144
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
145
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
146
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
147
};
148

    
149
/* Input permutation for the simple_idct_mmx */
150
static const uint8_t simple_mmx_permutation[64]={
151
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
152
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
153
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
154
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
155
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
156
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
157
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
158
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
159
};
160

    
161
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
162

    
163
/**
 * Initializes a ScanTable from a scan order and an IDCT permutation.
 * @param permutation   IDCT coefficient permutation (64 entries)
 * @param st            scan table to fill
 * @param src_scantable canonical (unpermuted) scan order, kept by reference
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, last;

    st->scantable = src_scantable;

    /* Apply the IDCT permutation to the scan order. */
    for (i = 0; i < 64; i++) {
        const int idx = src_scantable[i];
        st->permutated[i] = permutation[idx];
#if ARCH_PPC
        /* inverse mapping: canonical position -> scan index */
        st->inverse[idx] = i;
#endif
    }

    /* raster_end[i] = highest permuted position among scan indices 0..i. */
    last = -1;
    for (i = 0; i < 64; i++) {
        const int pos = st->permutated[i];
        if (pos > last)
            last = pos;
        st->raster_end[i] = last;
    }
}
186

    
187
/* Sum of all 256 samples of a 16x16 block; rows are line_size bytes apart. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;
    }
    return sum;
}
208

    
209
/* Sum of squares of all samples of a 16x16 block (the L2 "norm" without
 * the square root); ff_squareTbl is biased by 256 so sq[] accepts signed
 * differences elsewhere, here only indices 0..255 are used. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-at-a-time reference version */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit host: load 8 pixels at once and peel bytes off.
             * NOTE(review): the cast assumes pix is 8-byte aligned and
             * relies on type-punning through uint64_t — verify alignment
             * guarantees of the callers before reusing this pattern. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit host: two 4-byte loads per group of 8 pixels */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;   /* advance to the next row of the block */
    }
    return s;
}
256

    
257
/* Byte-swaps w 32-bit words from src into dst (may alias). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    /* main loop, 8 words per iteration */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    /* remaining tail of fewer than 8 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
274

    
275
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
276
{
277
    int s, i;
278
    uint32_t *sq = ff_squareTbl + 256;
279

    
280
    s = 0;
281
    for (i = 0; i < h; i++) {
282
        s += sq[pix1[0] - pix2[0]];
283
        s += sq[pix1[1] - pix2[1]];
284
        s += sq[pix1[2] - pix2[2]];
285
        s += sq[pix1[3] - pix2[3]];
286
        pix1 += line_size;
287
        pix2 += line_size;
288
    }
289
    return s;
290
}
291

    
292
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
293
{
294
    int s, i;
295
    uint32_t *sq = ff_squareTbl + 256;
296

    
297
    s = 0;
298
    for (i = 0; i < h; i++) {
299
        s += sq[pix1[0] - pix2[0]];
300
        s += sq[pix1[1] - pix2[1]];
301
        s += sq[pix1[2] - pix2[2]];
302
        s += sq[pix1[3] - pix2[3]];
303
        s += sq[pix1[4] - pix2[4]];
304
        s += sq[pix1[5] - pix2[5]];
305
        s += sq[pix1[6] - pix2[6]];
306
        s += sq[pix1[7] - pix2[7]];
307
        pix1 += line_size;
308
        pix2 += line_size;
309
    }
310
    return s;
311
}
312

    
313
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
314
{
315
    int s, i;
316
    uint32_t *sq = ff_squareTbl + 256;
317

    
318
    s = 0;
319
    for (i = 0; i < h; i++) {
320
        s += sq[pix1[ 0] - pix2[ 0]];
321
        s += sq[pix1[ 1] - pix2[ 1]];
322
        s += sq[pix1[ 2] - pix2[ 2]];
323
        s += sq[pix1[ 3] - pix2[ 3]];
324
        s += sq[pix1[ 4] - pix2[ 4]];
325
        s += sq[pix1[ 5] - pix2[ 5]];
326
        s += sq[pix1[ 6] - pix2[ 6]];
327
        s += sq[pix1[ 7] - pix2[ 7]];
328
        s += sq[pix1[ 8] - pix2[ 8]];
329
        s += sq[pix1[ 9] - pix2[ 9]];
330
        s += sq[pix1[10] - pix2[10]];
331
        s += sq[pix1[11] - pix2[11]];
332
        s += sq[pix1[12] - pix2[12]];
333
        s += sq[pix1[13] - pix2[13]];
334
        s += sq[pix1[14] - pix2[14]];
335
        s += sq[pix1[15] - pix2[15]];
336

    
337
        pix1 += line_size;
338
        pix2 += line_size;
339
    }
340
    return s;
341
}
342

    
343

    
344
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
345
/* Wavelet-domain distortion metric (snow encoder): takes the pixel
 * difference, runs an integer spatial DWT on it, then sums absolute
 * subband coefficients weighted per level/orientation.
 * w must equal h and be 8, 16 or 32; type selects the wavelet
 * (passed through to ff_spatial_dwt; 0 and 1 select the two weight sets). */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* decomposition levels: 3 for 8x8, else 4 */
    int tmp[32*32];                     /* DWT work buffer, fixed stride of 32 ints */
    int level, ori;
    /* subband weights, indexed [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* pixel difference, scaled by 16 to keep precision through the integer DWT */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    /* walk every subband: level 0 includes the LL band (ori starts at 0),
     * deeper levels only have the three detail orientations */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);   /* subband width == height */
            int sx= (ori&1) ? size : 0;       /* column offset of the subband */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;  /* row offset (in ints, pre-multiplied) */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;   /* compensates the <<4 input scaling and the weight magnitudes */
}
413

    
414
/* 5/3-wavelet distortion metric on an 8-wide block (type=1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
417

    
418
/* 9/7-wavelet distortion metric on an 8-wide block (type=0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
421

    
422
/* 5/3-wavelet distortion metric on a 16-wide block (type=1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
425

    
426
/* 9/7-wavelet distortion metric on a 16-wide block (type=0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
429

    
430
/* 5/3-wavelet distortion metric on a 32-wide block (type=1); non-static,
 * referenced from outside this file. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
433

    
434
/* 9/7-wavelet distortion metric on a 32-wide block (type=0); non-static,
 * referenced from outside this file. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
437
#endif
438

    
439
/* draw the edges of width 'w' of an image of size width, height */
440
//FIXME check that this is ok for mpeg4 interlaced
441
/* Pads an image by replicating its border: extends a width x height image
 * (rows wrap bytes apart, top-left at buf) by w pixels on every side. */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *row;
    uint8_t *const bottom = buf + (height - 1) * wrap;
    int i;

    /* replicate the first row upward and the last row downward */
    for (i = 1; i <= w; i++) {
        memcpy(buf    - i * wrap, buf,    width);
        memcpy(bottom + i * wrap, bottom, width);
    }

    /* replicate the first/last sample of every row sideways */
    row = buf;
    for (i = 0; i < height; i++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* fill the four w x w corner areas from the corner samples */
    for (i = 1; i <= w; i++) {
        memset(buf    - i * wrap - w,     buf[0],            w); /* top left */
        memset(buf    - i * wrap + width, buf[width - 1],    w); /* top right */
        memset(bottom + i * wrap - w,     bottom[0],         w); /* bottom left */
        memset(bottom + i * wrap + width, bottom[width - 1], w); /* bottom right */
    }
}
467

    
468
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer (may be negative or beyond w)
 * @param src_y y coordinate of the top left sample of the block in the source buffer (may be negative or beyond h)
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp a fully out-of-bounds block so that at least one source
     * row/column overlaps the valid area; src is shifted so that the
     * same (src_x, src_y)-relative addressing still works below. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* region of the block that lies inside the source picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first valid row upward
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last valid row downward
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* finally extend every (already filled) row sideways */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
538

    
539
/* Widens an 8x8 block of bytes into DCT coefficients (block is 8x8, dense). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
557

    
558
/* Stores the 8x8 difference s1 - s2 as DCT coefficients (block is dense 8x8). */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
577

    
578

    
579
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
580
                                 int line_size)
581
{
582
    int i;
583
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
584

    
585
    /* read the pixels */
586
    for(i=0;i<8;i++) {
587
        pixels[0] = cm[block[0]];
588
        pixels[1] = cm[block[1]];
589
        pixels[2] = cm[block[2]];
590
        pixels[3] = cm[block[3]];
591
        pixels[4] = cm[block[4]];
592
        pixels[5] = cm[block[5]];
593
        pixels[6] = cm[block[6]];
594
        pixels[7] = cm[block[7]];
595

    
596
        pixels += line_size;
597
        block += 8;
598
    }
599
}
600

    
601
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
602
                                 int line_size)
603
{
604
    int i;
605
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
606

    
607
    /* read the pixels */
608
    for(i=0;i<4;i++) {
609
        pixels[0] = cm[block[0]];
610
        pixels[1] = cm[block[1]];
611
        pixels[2] = cm[block[2]];
612
        pixels[3] = cm[block[3]];
613

    
614
        pixels += line_size;
615
        block += 8;
616
    }
617
}
618

    
619
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
620
                                 int line_size)
621
{
622
    int i;
623
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
624

    
625
    /* read the pixels */
626
    for(i=0;i<2;i++) {
627
        pixels[0] = cm[block[0]];
628
        pixels[1] = cm[block[1]];
629

    
630
        pixels += line_size;
631
        block += 8;
632
    }
633
}
634

    
635
/* Writes an 8x8 signed coefficient block as bytes: adds a 128 bias and
 * saturates to [0,255]. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        block  += 8;
        pixels += line_size;
    }
}
655

    
656
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
657
                          int line_size)
658
{
659
    int i;
660
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
661

    
662
    /* read the pixels */
663
    for(i=0;i<8;i++) {
664
        pixels[0] = cm[pixels[0] + block[0]];
665
        pixels[1] = cm[pixels[1] + block[1]];
666
        pixels[2] = cm[pixels[2] + block[2]];
667
        pixels[3] = cm[pixels[3] + block[3]];
668
        pixels[4] = cm[pixels[4] + block[4]];
669
        pixels[5] = cm[pixels[5] + block[5]];
670
        pixels[6] = cm[pixels[6] + block[6]];
671
        pixels[7] = cm[pixels[7] + block[7]];
672
        pixels += line_size;
673
        block += 8;
674
    }
675
}
676

    
677
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
678
                          int line_size)
679
{
680
    int i;
681
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
682

    
683
    /* read the pixels */
684
    for(i=0;i<4;i++) {
685
        pixels[0] = cm[pixels[0] + block[0]];
686
        pixels[1] = cm[pixels[1] + block[1]];
687
        pixels[2] = cm[pixels[2] + block[2]];
688
        pixels[3] = cm[pixels[3] + block[3]];
689
        pixels += line_size;
690
        block += 8;
691
    }
692
}
693

    
694
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
695
                          int line_size)
696
{
697
    int i;
698
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
699

    
700
    /* read the pixels */
701
    for(i=0;i<2;i++) {
702
        pixels[0] = cm[pixels[0] + block[0]];
703
        pixels[1] = cm[pixels[1] + block[1]];
704
        pixels += line_size;
705
        block += 8;
706
    }
707
}
708

    
709
/* Adds an 8x8 coefficient block onto pixels WITHOUT clamping
 * (wraps on uint8_t overflow, like the original). */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
725

    
726
/* 4x4 unclamped add; NOTE: unlike the 4-wide clamped variants, here the
 * coefficient block is densely packed (rows advance by 4, not 8). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
738

    
739
/* Sum of absolute values over one 8x8 (64-entry) coefficient block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *end = block + 64;
    int total = 0;

    while (block < end)
        total += FFABS(*block++);
    return total;
}
746

    
747
#if 0
748

749
#define PIXOP2(OPNAME, OP) \
750
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
751
{\
752
    int i;\
753
    for(i=0; i<h; i++){\
754
        OP(*((uint64_t*)block), AV_RN64(pixels));\
755
        pixels+=line_size;\
756
        block +=line_size;\
757
    }\
758
}\
759
\
760
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
761
{\
762
    int i;\
763
    for(i=0; i<h; i++){\
764
        const uint64_t a= AV_RN64(pixels  );\
765
        const uint64_t b= AV_RN64(pixels+1);\
766
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
767
        pixels+=line_size;\
768
        block +=line_size;\
769
    }\
770
}\
771
\
772
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
773
{\
774
    int i;\
775
    for(i=0; i<h; i++){\
776
        const uint64_t a= AV_RN64(pixels  );\
777
        const uint64_t b= AV_RN64(pixels+1);\
778
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
779
        pixels+=line_size;\
780
        block +=line_size;\
781
    }\
782
}\
783
\
784
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
785
{\
786
    int i;\
787
    for(i=0; i<h; i++){\
788
        const uint64_t a= AV_RN64(pixels          );\
789
        const uint64_t b= AV_RN64(pixels+line_size);\
790
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
791
        pixels+=line_size;\
792
        block +=line_size;\
793
    }\
794
}\
795
\
796
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
797
{\
798
    int i;\
799
    for(i=0; i<h; i++){\
800
        const uint64_t a= AV_RN64(pixels          );\
801
        const uint64_t b= AV_RN64(pixels+line_size);\
802
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
803
        pixels+=line_size;\
804
        block +=line_size;\
805
    }\
806
}\
807
\
808
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
809
{\
810
        int i;\
811
        const uint64_t a= AV_RN64(pixels  );\
812
        const uint64_t b= AV_RN64(pixels+1);\
813
        uint64_t l0=  (a&0x0303030303030303ULL)\
814
                    + (b&0x0303030303030303ULL)\
815
                    + 0x0202020202020202ULL;\
816
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
817
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
818
        uint64_t l1,h1;\
819
\
820
        pixels+=line_size;\
821
        for(i=0; i<h; i+=2){\
822
            uint64_t a= AV_RN64(pixels  );\
823
            uint64_t b= AV_RN64(pixels+1);\
824
            l1=  (a&0x0303030303030303ULL)\
825
               + (b&0x0303030303030303ULL);\
826
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
827
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
828
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
829
            pixels+=line_size;\
830
            block +=line_size;\
831
            a= AV_RN64(pixels  );\
832
            b= AV_RN64(pixels+1);\
833
            l0=  (a&0x0303030303030303ULL)\
834
               + (b&0x0303030303030303ULL)\
835
               + 0x0202020202020202ULL;\
836
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
837
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
838
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
839
            pixels+=line_size;\
840
            block +=line_size;\
841
        }\
842
}\
843
\
844
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
845
{\
846
        int i;\
847
        const uint64_t a= AV_RN64(pixels  );\
848
        const uint64_t b= AV_RN64(pixels+1);\
849
        uint64_t l0=  (a&0x0303030303030303ULL)\
850
                    + (b&0x0303030303030303ULL)\
851
                    + 0x0101010101010101ULL;\
852
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
853
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
854
        uint64_t l1,h1;\
855
\
856
        pixels+=line_size;\
857
        for(i=0; i<h; i+=2){\
858
            uint64_t a= AV_RN64(pixels  );\
859
            uint64_t b= AV_RN64(pixels+1);\
860
            l1=  (a&0x0303030303030303ULL)\
861
               + (b&0x0303030303030303ULL);\
862
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
864
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
865
            pixels+=line_size;\
866
            block +=line_size;\
867
            a= AV_RN64(pixels  );\
868
            b= AV_RN64(pixels+1);\
869
            l0=  (a&0x0303030303030303ULL)\
870
               + (b&0x0303030303030303ULL)\
871
               + 0x0101010101010101ULL;\
872
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
875
            pixels+=line_size;\
876
            block +=line_size;\
877
        }\
878
}\
879
\
880
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
881
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
882
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
883
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
884
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
885
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
886
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
887

888
/* Byte-parallel rounded average over a 64-bit word (eight pixels at once),
 * using the identity avg = (a|b) - (((a^b) & ~per-byte-LSB) >> 1). */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
890

    
891
/*
 * PIXOP2(OPNAME, OP) instantiates the whole family of 8-bit pixel
 * copy/average primitives for a given store macro OP (op_put stores the
 * value, op_avg averages it with the destination):
 *   - plain block ops for widths 2/4/8/16 (_pixels{2,4,8,16}_c),
 *   - half-pel variants _x2/_y2/_xy2 (source shifted by 1 pixel, 1 line,
 *     or both, and averaged),
 *   - the _l2/_l4 helpers averaging two or four independent sources,
 *   - "no_rnd" flavours that bias rounding down (0x01 instead of 0x02
 *     in the xy2/l4 low-bit accumulators, no_rnd_avg32 in l2).
 * This is the 32-bit fallback path (see the #else above): four bytes are
 * processed per uint32_t via AV_RN32/rnd_avg32/no_rnd_avg32; the xy2/l4
 * kernels split each byte into its low 2 bits (summed with rounding) and
 * high 6 bits (pre-shifted) so four bytes can be averaged without carries
 * crossing byte lanes.  16-wide versions are built from the 8-wide ones
 * via CALL_2X_PIXELS.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* xy2: 2x2 bilinear half-pel average of four diagonal neighbours */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

    
1257
/* Byte-parallel rounded average of two 32-bit words (four pixels at once). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Plain store: generates the put_* flavour of the PIXOP2 family. */
#define op_put(a, b) a = b

/* Instantiate both flavours of the pixel-op family defined above. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounded averages of 2 and 4 samples, used by the MC helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1268

    
1269
/* Adapter: exposes the macro-generated two-source averager (which takes three
 * independent strides) with the single-stride signature expected by callers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1272

    
1273
/* 8-wide counterpart of put_no_rnd_pixels16_l2_c: single-stride adapter
 * around the macro-generated helper. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1276

    
1277
/**
 * Single-vector global motion compensation ("gmc1"): 1/16-pel bilinear
 * interpolation of an 8-pixel-wide, h-row block.  Each output pixel is the
 * weighted blend of its 2x2 source neighbourhood; x16/y16 are the fractional
 * offsets in 1/16-pel units, 'rounder' is added before the >>8 normalisation.
 * The four weights always sum to 256, so the result stays in 0..255.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights of the four neighbours; A + B + C + D == 256 */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y;

    for (y = 0; y < h; y++) {
        int x;
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[x + stride] + D * src[x + stride + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1299

    
1300
/**
 * Global motion compensation with a full affine motion field
 * (presumably MPEG-4 GMC / sprite warping — confirm against callers).
 *
 * For each pixel (x,y) of an 8-wide, h-high block the accumulators are
 *     vx = ox + x*dxx + y*dxy,   vy = oy + x*dyx + y*dyy
 * kept in 16.16 fixed point; (v>>16) is the position in 1/(1<<shift)-pel
 * units, whose low 'shift' bits are the bilinear fraction and whose high
 * bits are the integer sample position.  r is the rounding constant for
 * the (shift*2)-bit downshift.  Positions outside the source are clamped
 * to the border, and interpolation is dropped along any clamped axis.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel denominator */

    /* make width/height the last valid coordinate, so 'pos < width' below
       guarantees pos+1 is still readable for the second bilinear tap */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the 16.16 accumulator into integer position and
               sub-pel fraction (low 'shift' bits) */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2x2 bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y clamped to the border: horizontal interpolation only,
                       scaled by s to keep the shift*2 normalisation */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x clamped to the border: vertical interpolation only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: nearest border sample, no interpolation */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            /* advance along the row */
            vx+= dxx;
            vy+= dyx;
        }
        /* advance to the next row */
        ox += dxy;
        oy += dyy;
    }
}
1357

    
1358
/* tpel motion compensation with a zero vector: plain block copy, dispatched
 * on width.  Widths other than 2/4/8/16 fall through the switch and write
 * nothing.  (The 683/2048 and 2731/32768 factors in the siblings below
 * approximate thirds — presumably SVQ3 thirdpel; confirm with callers.) */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1366

    
1367
/* tpel interpolation, horizontal weights (2,1): bit-exact fixed-point form
 * (683*(2*a + b + 1)) >> 11, where 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1377

    
1378
/* tpel interpolation, horizontal weights (1,2): (683*(a + 2*b + 1)) >> 11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1388

    
1389
/* tpel interpolation, vertical weights (2,1): (683*(2*a + below + 1)) >> 11. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1399

    
1400
/* tpel 2x2 interpolation, weights (4,3 / 3,2): bit-exact as
 * (2731*(4*a + 3*b + 3*c + 2*d + 6)) >> 15, 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1410

    
1411
/* tpel 2x2 interpolation, weights (3,2 / 4,3):
 * (2731*(3*a + 2*b + 4*c + 3*d + 6)) >> 15. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] +
                              4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1421

    
1422
/* tpel interpolation, vertical weights (1,2): (683*(a + 2*below + 1)) >> 11. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1432

    
1433
/* tpel 2x2 interpolation, weights (3,4 / 2,3):
 * (2731*(3*a + 4*b + 2*c + 3*d + 6)) >> 15. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] +
                              2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1443

    
1444
/* tpel 2x2 interpolation, weights (2,3 / 3,4):
 * (2731*(2*a + 3*b + 3*c + 4*d + 6)) >> 15. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1454

    
1455
/* tpel MC with a zero vector, averaging flavour: rounded average of the
 * source block with the existing destination, dispatched on width.
 * Widths other than 2/4/8/16 fall through the switch and write nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1463

    
1464
/* Averaging flavour of mc10: interpolate with weights (2,1), then take the
 * rounded average with the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1474

    
1475
/* Averaging flavour of mc20: weights (1,2), then rounded average with dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1485

    
1486
/* Averaging flavour of mc01: vertical weights (2,1), then rounded average. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1496

    
1497
/* Averaging flavour of mc11: 2x2 weights (4,3 / 3,2), then rounded average. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (4 * src[x] + 3 * src[x + 1] +
                                   3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1507

    
1508
/* Averaging flavour of mc12: 2x2 weights (3,2 / 4,3), then rounded average. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (3 * src[x] + 2 * src[x + 1] +
                                   4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1518

    
1519
/* Averaging flavour of mc02: vertical weights (1,2), then rounded average. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1529

    
1530
/* Averaging flavour of mc21: 2x2 weights (3,4 / 2,3), then rounded average. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (3 * src[x] + 4 * src[x + 1] +
                                   2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1540

    
1541
/* Averaging flavour of mc22: 2x2 weights (2,3 / 3,4), then rounded average. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (2 * src[x] + 3 * src[x + 1] +
                                   3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1551
#if 0
1552
#define TPEL_WIDTH(width)\
1553
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1554
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1555
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1556
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1557
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1558
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1559
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1560
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1561
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1562
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1563
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1564
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1565
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1566
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1567
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1568
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1569
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1570
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1571
#endif
1572

    
1573
/*
 * H264_CHROMA_MC(OPNAME, OP) instantiates the H.264 chroma motion
 * compensation kernels for block widths 2, 4 and 8.  x and y are the
 * eighth-pel fractional offsets (asserted to lie in 0..7); each output
 * pixel is the 2x2 bilinear blend
 *     A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1]
 * with weights summing to 64, handed to OP for the final rounding/downshift
 * (see op_put/op_avg defined right after the macro).  When D == 0 the blend
 * degenerates to a two-tap filter along a single axis: 'step' selects the
 * direction (vertical when C != 0, horizontal otherwise), halving the
 * multiplies per pixel.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1673

    
1674
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1675
#define op_put(a, b) a = (((b) + 32)>>6)
1676

    
1677
H264_CHROMA_MC(put_       , op_put)
1678
H264_CHROMA_MC(avg_       , op_avg)
1679
#undef op_avg
1680
#undef op_put
1681

    
1682
/**
 * 8xh chroma motion compensation with bilinear 1/8-pel interpolation,
 * "no rounding" variant: the rounding constant is 32-4 = 28 instead of 32,
 * which biases results downward (used by codecs that require the
 * no-rounding mode, e.g. VC-1/WMV).
 *
 * @param dst destination block (8 bytes wide, h rows), align 8
 * @param src source pixels, align 1; rows 0..h and columns 0..8 are read
 * @param stride line size of both dst and src in bytes
 * @param h      number of output rows
 * @param x,y    fractional motion vector components, 0..7 (eighths of a pel)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    /* bilinear weights for the four neighbouring pixels; A+B+C+D == 64 */
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        /* weighted sum of the 2x2 neighbourhood, biased by 28 (no-rnd), /64 */
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

/**
 * Generator for the MPEG-4 quarter-pel motion compensation functions.
 * Expands into the 8-tap horizontal/vertical lowpass filters and all 16
 * qpelN_mcXY positions for 8x8 and 16x16 blocks.  Note the taps are
 * mirrored at the block edges (e.g. src[8] reused near the right border),
 * as required by MPEG-4 Part 2.
 *
 * @param r      unused tag distinguishing instantiations
 * @param OPNAME function-name prefix (put_, put_no_rnd_, avg_)
 * @param RND    infix selecting the rounding mode of the helpers used
 * @param OP     store macro applied to each filtered value
 *
 * Only comment style inside the macro is /<star> <star>/ — a // comment would
 * swallow the continuation backslash.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 quarter-pel positions: mcXY = X/4 pel horizontal, Y/4 pel vertical */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* 16x16 quarter-pel positions, same scheme with 24-byte intermediate stride */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2190
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2191
#define op_put(a, b) a = cm[((b) + 16)>>5]
2192
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2193

    
2194
QPEL_MC(0, put_       , _       , op_put)
2195
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2196
QPEL_MC(0, avg_       , _       , op_avg)
2197
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2198
#undef op_avg
2199
#undef op_avg_no_rnd
2200
#undef op_put
2201
#undef op_put_no_rnd
2202

    
2203
#if 1
2204
#define H264_LOWPASS(OPNAME, OP, OP2) \
2205
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206
    const int h=2;\
2207
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208
    int i;\
2209
    for(i=0; i<h; i++)\
2210
    {\
2211
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213
        dst+=dstStride;\
2214
        src+=srcStride;\
2215
    }\
2216
}\
2217
\
2218
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219
    const int w=2;\
2220
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221
    int i;\
2222
    for(i=0; i<w; i++)\
2223
    {\
2224
        const int srcB= src[-2*srcStride];\
2225
        const int srcA= src[-1*srcStride];\
2226
        const int src0= src[0 *srcStride];\
2227
        const int src1= src[1 *srcStride];\
2228
        const int src2= src[2 *srcStride];\
2229
        const int src3= src[3 *srcStride];\
2230
        const int src4= src[4 *srcStride];\
2231
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2232
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2233
        dst++;\
2234
        src++;\
2235
    }\
2236
}\
2237
\
2238
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2239
    const int h=2;\
2240
    const int w=2;\
2241
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242
    int i;\
2243
    src -= 2*srcStride;\
2244
    for(i=0; i<h+5; i++)\
2245
    {\
2246
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2247
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2248
        tmp+=tmpStride;\
2249
        src+=srcStride;\
2250
    }\
2251
    tmp -= tmpStride*(h+5-2);\
2252
    for(i=0; i<w; i++)\
2253
    {\
2254
        const int tmpB= tmp[-2*tmpStride];\
2255
        const int tmpA= tmp[-1*tmpStride];\
2256
        const int tmp0= tmp[0 *tmpStride];\
2257
        const int tmp1= tmp[1 *tmpStride];\
2258
        const int tmp2= tmp[2 *tmpStride];\
2259
        const int tmp3= tmp[3 *tmpStride];\
2260
        const int tmp4= tmp[4 *tmpStride];\
2261
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2262
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2263
        dst++;\
2264
        tmp++;\
2265
    }\
2266
}\
2267
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2268
    const int h=4;\
2269
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270
    int i;\
2271
    for(i=0; i<h; i++)\
2272
    {\
2273
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2274
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2275
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2276
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2277
        dst+=dstStride;\
2278
        src+=srcStride;\
2279
    }\
2280
}\
2281
\
2282
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283
    const int w=4;\
2284
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2285
    int i;\
2286
    for(i=0; i<w; i++)\
2287
    {\
2288
        const int srcB= src[-2*srcStride];\
2289
        const int srcA= src[-1*srcStride];\
2290
        const int src0= src[0 *srcStride];\
2291
        const int src1= src[1 *srcStride];\
2292
        const int src2= src[2 *srcStride];\
2293
        const int src3= src[3 *srcStride];\
2294
        const int src4= src[4 *srcStride];\
2295
        const int src5= src[5 *srcStride];\
2296
        const int src6= src[6 *srcStride];\
2297
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2298
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2299
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2300
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2301
        dst++;\
2302
        src++;\
2303
    }\
2304
}\
2305
\
2306
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2307
    const int h=4;\
2308
    const int w=4;\
2309
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310
    int i;\
2311
    src -= 2*srcStride;\
2312
    for(i=0; i<h+5; i++)\
2313
    {\
2314
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2315
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2316
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2317
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2318
        tmp+=tmpStride;\
2319
        src+=srcStride;\
2320
    }\
2321
    tmp -= tmpStride*(h+5-2);\
2322
    for(i=0; i<w; i++)\
2323
    {\
2324
        const int tmpB= tmp[-2*tmpStride];\
2325
        const int tmpA= tmp[-1*tmpStride];\
2326
        const int tmp0= tmp[0 *tmpStride];\
2327
        const int tmp1= tmp[1 *tmpStride];\
2328
        const int tmp2= tmp[2 *tmpStride];\
2329
        const int tmp3= tmp[3 *tmpStride];\
2330
        const int tmp4= tmp[4 *tmpStride];\
2331
        const int tmp5= tmp[5 *tmpStride];\
2332
        const int tmp6= tmp[6 *tmpStride];\
2333
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2334
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2335
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2336
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2337
        dst++;\
2338
        tmp++;\
2339
    }\
2340
}\
2341
\
2342
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343
    const int h=8;\
2344
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2345
    int i;\
2346
    for(i=0; i<h; i++)\
2347
    {\
2348
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2356
        dst+=dstStride;\
2357
        src+=srcStride;\
2358
    }\
2359
}\
2360
\
2361
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362
    const int w=8;\
2363
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2364
    int i;\
2365
    for(i=0; i<w; i++)\
2366
    {\
2367
        const int srcB= src[-2*srcStride];\
2368
        const int srcA= src[-1*srcStride];\
2369
        const int src0= src[0 *srcStride];\
2370
        const int src1= src[1 *srcStride];\
2371
        const int src2= src[2 *srcStride];\
2372
        const int src3= src[3 *srcStride];\
2373
        const int src4= src[4 *srcStride];\
2374
        const int src5= src[5 *srcStride];\
2375
        const int src6= src[6 *srcStride];\
2376
        const int src7= src[7 *srcStride];\
2377
        const int src8= src[8 *srcStride];\
2378
        const int src9= src[9 *srcStride];\
2379
        const int src10=src[10*srcStride];\
2380
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2388
        dst++;\
2389
        src++;\
2390
    }\
2391
}\
2392
\
2393
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2394
    const int h=8;\
2395
    const int w=8;\
2396
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397
    int i;\
2398
    src -= 2*srcStride;\
2399
    for(i=0; i<h+5; i++)\
2400
    {\
2401
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2409
        tmp+=tmpStride;\
2410
        src+=srcStride;\
2411
    }\
2412
    tmp -= tmpStride*(h+5-2);\
2413
    for(i=0; i<w; i++)\
2414
    {\
2415
        const int tmpB= tmp[-2*tmpStride];\
2416
        const int tmpA= tmp[-1*tmpStride];\
2417
        const int tmp0= tmp[0 *tmpStride];\
2418
        const int tmp1= tmp[1 *tmpStride];\
2419
        const int tmp2= tmp[2 *tmpStride];\
2420
        const int tmp3= tmp[3 *tmpStride];\
2421
        const int tmp4= tmp[4 *tmpStride];\
2422
        const int tmp5= tmp[5 *tmpStride];\
2423
        const int tmp6= tmp[6 *tmpStride];\
2424
        const int tmp7= tmp[7 *tmpStride];\
2425
        const int tmp8= tmp[8 *tmpStride];\
2426
        const int tmp9= tmp[9 *tmpStride];\
2427
        const int tmp10=tmp[10*tmpStride];\
2428
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2436
        dst++;\
2437
        tmp++;\
2438
    }\
2439
}\
2440
\
2441
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2443
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444
    src += 8*srcStride;\
2445
    dst += 8*dstStride;\
2446
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2447
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2448
}\
2449
\
2450
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2452
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453
    src += 8*srcStride;\
2454
    dst += 8*dstStride;\
2455
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2456
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2457
}\
2458
\
2459
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2461
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462
    src += 8*srcStride;\
2463
    dst += 8*dstStride;\
2464
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2465
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2466
}\
2467

    
2468
/* Template generating all 16 quarter-pel motion-compensation entry points
 * (mcXY, X/Y = horizontal/vertical quarter-pel phase 0..3) for one block
 * SIZE, built from the *_h/_v/_hv_lowpass half-pel filters above plus
 * pixel copy/average helpers. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2604

    
2605
/* Rounding/clipping ops plugged into H264_LOWPASS: OP/OP2 for 1-D
 * (>>5 after +16) and 2-D (>>10 after +512) filter sums, in put and
 * avg (rounded average with existing dst) flavors. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2610

    
2611
H264_LOWPASS(put_       , op_put, op2_put)
2612
H264_LOWPASS(avg_       , op_avg, op2_avg)
2613
H264_MC(put_, 2)
2614
H264_MC(put_, 4)
2615
H264_MC(put_, 8)
2616
H264_MC(put_, 16)
2617
H264_MC(avg_, 4)
2618
H264_MC(avg_, 8)
2619
H264_MC(avg_, 16)
2620

    
2621
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2626

    
2627
/* H.264 explicit weighted prediction: op_scale1 scales one block in
 * place, op_scale2 blends src into dst with two weights. H264_WEIGHT
 * expands to the WxH uni- and bi-weighted pixel functions; the
 * unrolled body falls out early via `continue` for narrow widths. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2681

    
2682
/* All WxH weighted-prediction block sizes used by the H.264 decoder. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)
2692

    
2693
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2696

    
2697
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2699
    int i;
2700

    
2701
    for(i=0; i<h; i++){
2702
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2710
        dst+=dstStride;
2711
        src+=srcStride;
2712
    }
2713
}
2714

    
2715
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Integer-pel CAVS MC entry points: plain 8x8/16x16 copies/averages. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2732

    
2733
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Integer-pel VC-1 MC: plain 8x8 copy (rnd parameter unused here). */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2741

    
2742
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */
2750

    
2751
#if CONFIG_RV40_DECODER
/* RV40 (3,3) chroma-position MC maps onto the plain xy2 (half-pel
 * center) pixel averaging helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2767

    
2768
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2770
    int i;
2771

    
2772
    for(i=0; i<w; i++){
2773
        const int src_1= src[ -srcStride];
2774
        const int src0 = src[0          ];
2775
        const int src1 = src[  srcStride];
2776
        const int src2 = src[2*srcStride];
2777
        const int src3 = src[3*srcStride];
2778
        const int src4 = src[4*srcStride];
2779
        const int src5 = src[5*srcStride];
2780
        const int src6 = src[6*srcStride];
2781
        const int src7 = src[7*srcStride];
2782
        const int src8 = src[8*srcStride];
2783
        const int src9 = src[9*srcStride];
2784
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2786
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2787
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2788
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2789
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2790
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2791
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2792
        src++;
2793
        dst++;
2794
    }
2795
}
2796

    
2797
/* WMV2 mspel MC entry points (8x8). Suffix mcXY = horizontal phase X,
 * vertical phase Y in half-pel units; phase 1/3 averages the filtered
 * half-pel plane with the nearer integer-pel plane. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 8x11: one extra row above, two below */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2844

    
2845
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2846
    if(CONFIG_ANY_H263) {
2847
    int x;
2848
    const int strength= ff_h263_loop_filter_strength[qscale];
2849

    
2850
    for(x=0; x<8; x++){
2851
        int d1, d2, ad1;
2852
        int p0= src[x-2*stride];
2853
        int p1= src[x-1*stride];
2854
        int p2= src[x+0*stride];
2855
        int p3= src[x+1*stride];
2856
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2857

    
2858
        if     (d<-2*strength) d1= 0;
2859
        else if(d<-  strength) d1=-2*strength - d;
2860
        else if(d<   strength) d1= d;
2861
        else if(d< 2*strength) d1= 2*strength - d;
2862
        else                   d1= 0;
2863

    
2864
        p1 += d1;
2865
        p2 -= d1;
2866
        if(p1&256) p1= ~(p1>>31);
2867
        if(p2&256) p2= ~(p2>>31);
2868

    
2869
        src[x-1*stride] = p1;
2870
        src[x+0*stride] = p2;
2871

    
2872
        ad1= FFABS(d1)>>1;
2873

    
2874
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2875

    
2876
        src[x-2*stride] = p0 - d2;
2877
        src[x+  stride] = p3 + d2;
2878
    }
2879
    }
2880
}
2881

    
2882
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2883
    if(CONFIG_ANY_H263) {
2884
    int y;
2885
    const int strength= ff_h263_loop_filter_strength[qscale];
2886

    
2887
    for(y=0; y<8; y++){
2888
        int d1, d2, ad1;
2889
        int p0= src[y*stride-2];
2890
        int p1= src[y*stride-1];
2891
        int p2= src[y*stride+0];
2892
        int p3= src[y*stride+1];
2893
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2894

    
2895
        if     (d<-2*strength) d1= 0;
2896
        else if(d<-  strength) d1=-2*strength - d;
2897
        else if(d<   strength) d1= d;
2898
        else if(d< 2*strength) d1= 2*strength - d;
2899
        else                   d1= 0;
2900

    
2901
        p1 += d1;
2902
        p2 -= d1;
2903
        if(p1&256) p1= ~(p1>>31);
2904
        if(p2&256) p2= ~(p2>>31);
2905

    
2906
        src[y*stride-1] = p1;
2907
        src[y*stride+0] = p2;
2908

    
2909
        ad1= FFABS(d1)>>1;
2910

    
2911
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2912

    
2913
        src[y*stride-2] = p0 - d2;
2914
        src[y*stride+1] = p3 + d2;
2915
    }
2916
    }
2917
}
2918

    
2919
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of one 8x8 block,
 * applied vertically into temp[] then horizontally back into src.
 * Border rows/columns are passed through (scaled so the final shift
 * reproduces the original value).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64]; /* vertically filtered block, scaled by 4 */

    /* top/bottom rows: copy, pre-scaled by 4 to match interior gain */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* interior rows: vertical (1,2,1) */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass with rounding; edge columns just rescale */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2945

    
2946
/**
 * H.264 normal (bS<4) luma deblocking over one 16-sample edge, split
 * into 4 groups of 4 lines. tc0[i] < 0 skips group i; alpha/beta are
 * the edge/activity thresholds. xstride steps across the edge,
 * ystride along it, so one routine serves both v and h filters.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* optionally filter p1/q1; each widens the clip range tc */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
2986
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2987
{
2988
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2989
}
2990
/* Luma deblocking across a vertical edge: p/q samples are horizontally
 * adjacent (xstride = 1), walking down the edge one line at a time. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 in-loop deblocking of a luma edge at intra (strongest) strength.
 * Filters all 16 lines; the strong 3-tap/5-tap smoothing is used when the
 * edge step is small (< alpha/4 + 2), otherwise only p0/q0 are adjusted.
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* small step across the edge: apply the strong filter */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* large step: weak filtering of p0', q0' only */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* Intra-strength luma deblocking across a horizontal edge (xstride = stride). */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra-strength luma deblocking across a vertical edge (xstride = 1). */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * H.264 in-loop deblocking of a chroma edge (normal strength).
 * 4 groups of 2 lines; tc <= 0 disables a group. Only p0/q0 are modified,
 * with the delta clipped to [-tc, tc].
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma deblocking across a horizontal edge (xstride = stride). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma deblocking across a vertical edge (xstride = 1). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 in-loop deblocking of a chroma edge at intra strength:
 * 8 lines, unconditional 2-tap smoothing of p0/q0 where thresholds pass.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Intra-strength chroma deblocking across a horizontal edge (xstride = stride). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra-strength chroma deblocking across a vertical edge (xstride = 1). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences (SAD) over a 16-pixel-wide block of h rows.
 * The unused first argument keeps the me_cmp_func signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD, 16 wide, against the half-pel horizontally interpolated reference
 * (avg2 of each pixel and its right neighbour; reads pix2[16] of each row).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD, 16 wide, against the half-pel vertically interpolated reference
 * (avg2 of each pixel and the one a line below).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD, 16 wide, against the half-pel diagonally interpolated reference
 * (avg4 of the surrounding 2x2 pixels; reads one extra column and row).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Sum of absolute differences (SAD) over an 8-pixel-wide block of h rows.
 * The unused first argument keeps the me_cmp_func signature.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD, 8 wide, against the half-pel horizontally interpolated reference
 * (avg2 of each pixel and its right neighbour; reads pix2[8] of each row).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD, 8 wide, against the half-pel vertically interpolated reference
 * (avg2 of each pixel and the one a line below).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD, 8 wide, against the half-pel diagonally interpolated reference
 * (avg4 of the surrounding 2x2 pixels; reads one extra column and row).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Noise-preserving SSE, 16 pixels wide: plain SSE (score1) plus the
 * accumulated difference of 2x2 second-difference magnitudes between the
 * two blocks (score2), weighted by avctx->nsse_weight (8 when v is NULL).
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * Noise-preserving SSE, 8 pixels wide: same scheme as nsse16_c
 * (SSE plus weighted 2x2 second-difference mismatch).
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
/**
 * Returns a weighted squared-error score for the residual 'rem' after
 * adding 'basis' scaled by 'scale' (BASIS_SHIFT -> RECON_SHIFT fixed
 * point, rounded). Presumably used by the quantization-noise-shaping
 * search — confirm against callers.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
/**
 * Adds basis[i]*scale (rounded, converted from BASIS_SHIFT to
 * RECON_SHIFT fixed point) to each of the 64 residual coefficients.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
/**
 * Permutes an 8x8 block in place.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector (maps original index -> permuted index)
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable; this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    /* move the non-zero coefficients aside and clear their old slots */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* write them back at their permuted positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
/* Comparison function for FF_CMP_ZERO: ignores its input and always
 * scores 0, so every candidate compares equal. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fills the 5-entry comparison-function table 'cmp' with the DSPContext
 * implementations selected by the low byte of 'type' (an FF_CMP_* value).
 * Unknown types leave the table zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
/* Zeroes one 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
/**
 * Zeroes six consecutive 64-coefficient DCT blocks:
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
/* dst[i] += src[i] for w bytes, a machine word at a time (SWAR): the
 * pb_7f/pb_80 masks add the low 7 bits per lane and patch the top bit
 * separately so no carry crosses a byte boundary; the tail loop handles
 * the last w % sizeof(long) bytes. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/* dst[i] = src1[i] + src2[i] for w bytes, using the same word-at-a-time
 * carryless SWAR addition as add_bytes_c. */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
/* dst[i] = src1[i] - src2[i] for w bytes. The main path subtracts a
 * machine word at a time with borrowless SWAR masking; on targets
 * without fast unaligned loads, a plain byte loop (unrolled by 8) is
 * used when src2 is misaligned. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median prediction encoder step:
 * dst[i] = src2[i] - mid_pred(left, src1[i], left + src1[i] - left_top),
 * where src1 is the line above and src2 the current line.
 * *left and *left_top carry the running predictor state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/* 2-point butterfly into separate outputs: o1 = i1+i2, o2 = i1-i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y <- x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded into the absolute sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of the src-dst pixel
 * difference (butterflies over rows, then columns), returning the sum
 * of absolute transformed coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard on each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; the last butterfly stage is folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the source
 * pixels themselves, sum of absolute coefficients minus the |DC| term
 * (so the block mean does not contribute).
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last butterfly stage folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
/**
 * DCT-domain SAD of an 8x8 block: forward DCT of the src1-src2 pixel
 * difference, then the sum of absolute coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#if CONFIG_GPL
/* One-dimensional 8-point integer transform (H.264 high-profile style,
 * built from add/sub and shifts only) operating through the SRC(x) and
 * DST(x,v) macros defined at each expansion site. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
/**
 * SAD in the H.264 8x8 transform domain: DCT8_1D over the rows of the
 * pixel difference, then over the columns with the absolute values
 * summed directly via the DST macro.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
/**
 * Maximum absolute DCT coefficient of the 8x8 pixel difference
 * between src1 and src2.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
/**
 * Quantization-noise score: runs the difference block through
 * quantize -> dequantize -> IDCT and returns the squared error against
 * the pre-quantization coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep the unquantized coefficients for comparison */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
/**
 * Rate-distortion score of coding an 8x8 block: quantizes the pixel
 * difference, estimates the VLC bit cost from the run/level tables,
 * reconstructs via dequantize + idct_add on a copy of src2, and returns
 * SSE(reconstruction, src1) plus a lambda-weighted bit cost.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* back up the 8x8 reference area (two 32-bit copies per line) */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* estimate the run/level VLC bit cost in scantable order */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT onto the backed-up pixels */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
/**
 * Rate-only comparison metric for one 8x8 block: DCT+quantize the residual
 * between src1 and src2 and count the bits the entropy coder would spend on
 * the quantized coefficients (no distortion term, unlike the rd metric).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    /* 64 DCT coefficients, 8-byte aligned for the DSP routines */
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* residual -> forward DCT + quantization; 'last' is the index of the
       last nonzero coefficient in scan order */
    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC length tables for the macroblock type; intra blocks
       also pay for the DC coefficient (biased by 256 into the table) */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* walk coefficients in scan order accumulating (run,level) code
           lengths; after +64 bias, levels outside 0..127 (i.e. original
           level outside -64..63) are charged the escape-code length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final nonzero coefficient uses the "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3922

    
3923
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    /* Sum of absolute vertical differences over a 16-pixel-wide block:
       for each of the h-1 adjacent row pairs, accumulate |row[x] - next_row[x]|. */
    int total = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int d = s[col] - s[col + stride];
            total += d >= 0 ? d : -d;
        }
        s += stride;
    }

    return total;
}
3937

    
3938
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    /* Vertical SAD of the error image s1 - s2, 16 columns wide: measures how
       much the per-pixel error changes from one row to the next. */
    int total = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int d = (s1[col] - s2[col]) - (s1[col + stride] - s2[col + stride]);
            total += d >= 0 ? d : -d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3952

    
3953
#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    /* Sum of squared vertical differences over a 16-pixel-wide block. */
    int total = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int d = s[col] - s[col + stride];
            total += SQ(d);
        }
        s += stride;
    }

    return total;
}
3968

    
3969
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    /* Sum of squared row-to-row changes of the error image s1 - s2,
       16 columns wide. */
    int total = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int d = (s1[col] - s2[col]) - (s1[col + stride] - s2[col + stride]);
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3983

    
3984
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    /* Sum of squared differences between an int8 vector and an int16 vector. */
    int i, total = 0;

    for(i = 0; i < size; i++){
        const int d = pix1[i] - pix2[i];
        total += d * d;
    }

    return total;
}
3992

    
3993
/* Instantiate the 16x16 variants of the 8x8 comparison kernels above: each
 * WRAPPER8_16_SQ(kernel8x8, name16) expansion defines name16 in terms of the
 * given 8x8 kernel (see the WRAPPER8_16_SQ macro definition earlier in this
 * file for the exact tiling). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4003

    
4004
static void vector_fmul_c(float *dst, const float *src, int len){
    /* In-place element-wise multiply: dst[i] *= src[i] for len elements. */
    int k;

    for(k = 0; k < len; k++)
        dst[k] = dst[k] * src[k];
}
4009

    
4010
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    /* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
    int k;

    for(k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
4016

    
4017
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    /* Multiply-accumulate with an integer bias, writing with a stride:
       dst[i*step] = src0[i]*src1[i] + src2[i] + src3. */
    int k;

    for(k = 0; k < len; k++)
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
}
4022

    
4023
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    /* Overlap-add windowing: blends src0 (len samples) with src1 (len
     * samples, consumed in reverse) under the 2*len-tap window win,
     * producing 2*len output samples with add_bias added to each. */
    int k;

    for(k = 0; k < len; k++){
        const int   m  = len - 1 - k;   /* mirrored index into src1/win */
        const float s0 = src0[k];
        const float s1 = src1[m];
        const float w0 = win[k];        /* first half of the window */
        const float w1 = win[len + m];  /* second half of the window */

        dst[k]             = s0*w1 - s1*w0 + add_bias;
        dst[2*len - 1 - k] = s0*w0 + s1*w1 + add_bias;
    }
}
4037

    
4038
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    /* Convert integer samples to float while applying a scale factor:
       dst[i] = src[i] * mul. */
    int k;

    for(k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
4043

    
4044
static av_always_inline int float_to_int16_one(const float *src){
    /* Bit-level float -> int16 conversion.  Callers pre-bias samples so the
     * float sits near 384.0f (bit pattern 0x43c00000); the low 16 bits of
     * the mantissa then directly encode the sample value.
     *
     * The bit pattern is extracted through a union instead of the previous
     * *(const int32_t*)src cast: reading a float through an int32_t lvalue
     * violates the C99 strict-aliasing rules (UB); union-based type punning
     * is well defined and compiles to the same code.
     * NOTE(review): the clamp below relies on arithmetic (sign-extending)
     * right shift of a negative value, which is implementation-defined but
     * holds on all compilers this code targets.
     */
    union { float f; int32_t i; } u;
    int_fast32_t tmp;

    u.f = *src;
    tmp = u.i;
    if(tmp & 0xf0000){
        /* Out of the representable window: (0x43c0ffff - tmp) >> 31 yields
         * -1 when the biased value is above the maximum and 0 when it is
         * below the minimum, so the caller's 16-bit truncation of
         * tmp - 0x8000 gives 0x7fff resp. -0x8000. */
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
4054

    
4055
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    /* Convert a buffer of pre-biased floats to 16-bit integer samples. */
    int k;

    for(k = 0; k < len; k++)
        dst[k] = float_to_int16_one(&src[k]);
}
4060

    
4061
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    /* Convert per-channel float buffers into one interleaved int16 buffer.
       The stereo case is special-cased as in the original implementation. */
    int i, j, c;

    if(channels == 2){
        for(i = 0; i < len; i++){
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    }else{
        for(c = 0; c < channels; c++){
            const float *chan = src[c];

            for(i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(chan + i);
        }
    }
}
4074

    
4075
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    /* Element-wise in-place add: v1[i] += v2[i] for the first order elements. */
    int k;

    for(k = 0; k < order; k++)
        v1[k] += v2[k];
}
4080

    
4081
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    /* Element-wise in-place subtract: v1[i] -= v2[i] for the first order elements. */
    int k;

    for(k = 0; k < order; k++)
        v1[k] -= v2[k];
}
4086

    
4087
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    /* Dot product with each partial product shifted right before summing:
       res = sum((v1[i]*v2[i]) >> shift).  The shift is applied per term,
       not to the final sum, so rounding matches the original exactly. */
    int k, res = 0;

    for(k = 0; k < order; k++)
        res += (v1[k] * v2[k]) >> shift;

    return res;
}
4096

    
4097
/* Fixed-point cosine constants for the WMV2 IDCT below.
 * W1..W7 = round(2048*sqrt(2)*cos(k*pi/16)); W0 is the plain 2048 scale. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4105

    
4106
/* One row of the WMV2 8x8 IDCT: even/odd butterfly in fixed point using the
 * W* constants above, with rounding (+1<<7 before >>8), written back in place
 * to b[0..7]. */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2: 181/256 ~= 1/sqrt(2), rounded */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: recombine with rounding and drop the 8 fractional bits */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
4132
/* One column of the WMV2 8x8 IDCT (stride-8 accesses into the block).
 * Step 1 keeps extra precision by pre-shifting only >>3 (with rounding), so
 * the final normalization is >>14 instead of the row pass's >>8. */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2: 181/256 ~= 1/sqrt(2), rounded */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: recombine with rounding and normalize by >>14 */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
4159
void ff_wmv2_idct_c(short * block){
    /* Full 2-D 8x8 WMV2 inverse DCT: one pass over the rows, then one pass
       over the columns, both in place. */
    int n;

    for(n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);
    for(n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
4169
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* WMV2 IDCT wrappers: transform the coefficient block in place, then either
 * store the clamped result to dest (put) or accumulate it into dest (add). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4181
/* j_rev_dct-based IDCT wrappers: inverse-transform the block in place, then
 * store the clamped result to dest (put) or accumulate it into dest (add). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4191

    
4192
/* 4-point variant of the j_rev_dct wrappers (j_rev_dct4 + the matching
 * clamped 4-pixel store/accumulate).
 * NOTE(review): presumably selected for reduced-resolution decoding --
 * confirm against the code that installs these in the function tables. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4202

    
4203
/* 2-point variant of the j_rev_dct wrappers (j_rev_dct2 + the matching
 * clamped 2-pixel store/accumulate). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4213

    
4214
/* Degenerate 1-point "IDCT": only the DC coefficient contributes, so the
 * output is (block[0] + 4) >> 3, clamped to 0..255 via the crop table, either
 * stored to dest[0] (put) or added onto it (add). */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4226

    
4227
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
    /* Deliberately does nothing; all parameters are ignored. */
}
4228

    
4229
/* init static data */
4230
void dsputil_static_init(void)
4231
{
4232
    int i;
4233

    
4234
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4235
    for(i=0;i<MAX_NEG_CROP;i++) {
4236
        ff_cropTbl[i] = 0;
4237
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4238
    }
4239

    
4240
    for(i=0;i<512;i++) {