/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* lpc.c */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
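
/* Editorial note (not in the original source): ~0UL/255 yields 0x01010101
 * (0x0101010101010101 with 64-bit longs), so multiplying it by a byte value
 * replicates that byte across every lane of a native word. Assuming 32-bit
 * longs:
 *
 *     ~0UL/255  == 0x01010101
 *     pb_7f     == 0x7f7f7f7f
 *     pb_80     == 0x80808080
 */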

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
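
/* Editorial sketch (not in the original source): the table above turns
 * division by a small constant into one multiply plus a shift. A minimal
 * example, valid under the bounds stated in the comment above; the helper
 * name is illustrative only:
 *
 *     static inline uint32_t div_by_table(uint32_t a, int b)
 *     {
 *         return (uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32);
 *     }
 *
 * e.g. div_by_table(1000, 3) == 333, since 1000*1431655766 >> 32 == 333.
 */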

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
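
/* Editorial sketch (not in the original source): a typical caller pairs a
 * reference scan order with the selected IDCT's input permutation, roughly
 * (hypothetical variable names):
 *
 *     ScanTable st;
 *     ff_init_scantable(dsp.idct_permutation, &st, ff_zigzag_direct);
 *     // st.permutated[i] : where the i-th scanned coefficient lands after
 *     //                    the IDCT-specific permutation
 *     // st.raster_end[i] : highest permuted position seen among the first
 *     //                    i+1 scan positions, used to bound partial IDCTs
 */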

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
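
/* Editorial note (not in the original source): pix_norm1_c() returns the sum
 * of squared sample values of a 16x16 block. ff_squareTbl is biased by 256 so
 * that sq[d] == d*d also holds for the negative differences fed to it by the
 * sse*_c functions below; the LONG_MAX test merely selects 64-bit or 32-bit
 * loads so eight table lookups are fed from wide reads instead of byte reads.
 */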

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
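
/* Editorial note (not in the original source): the sse*_c functions return a
 * plain sum of squared errors over a 4/8/16 pixel wide block of height h.
 * For reference, a global PSNR can be derived from an accumulated SSE as
 *
 *     PSNR = 10 * log10(255*255 * pixel_count / SSE)
 *
 * (assuming 8-bit samples; not computed in this file).
 */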


#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
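
/* Editorial note (not in the original source): w_c() measures the difference
 * between two blocks as a weighted L1 norm of its 5/3 (type=1) or 9/7
 * (type=0) wavelet transform; the scale[][][][] entries appear to be
 * per-subband weights, and the final >>9 compensates for the <<4 input
 * scaling and the weight magnitudes. The w53_*/w97_* wrappers only select
 * block size and wavelet type.
 */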

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
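
/* Editorial note (not in the original source): draw_edges_c() pads a picture
 * in place by replicating its outermost samples w pixels outward on all four
 * sides (plus corners), so that motion vectors pointing slightly outside the
 * picture can be serviced without per-access clipping.
 */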

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
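
/* Editorial sketch (not in the original source): a motion compensation caller
 * typically switches to ff_emulated_edge_mc() only when the (possibly
 * interpolation-extended) source read sticks out of the padded picture,
 * e.g. (hypothetical names, 17x17 region for a half-pel 16x16 read):
 *
 *     if(src_x < 0 || src_y < 0 || src_x + 17 > w || src_y + 17 > h){
 *         ff_emulated_edge_mc(edge_buf, src_ptr, linesize, 17, 17,
 *                             src_x, src_y, w, h);
 *         src_ptr = edge_buf;
 *     }
 */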

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
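
/* Editorial note (not in the original source): rnd_avg32()/no_rnd_avg32()
 * average four packed bytes at once without unpacking, using the identities
 *
 *     (a+b+1)>>1 == (a|b) - (((a^b) & 0xFEFEFEFE)>>1)   // rounding up
 *     (a+b)>>1   == (a&b) + (((a^b) & 0xFEFEFEFE)>>1)   // rounding down
 *
 * where masking with 0xFEFEFEFE keeps the per-byte shift from leaking bits
 * between lanes. The op_avg/op_put defines above instantiate PIXOP2 into the
 * avg_* and put_* pixel primitives.
 */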

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
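
/* Editorial note (not in the original source): in gmc1_c() the bilinear
 * weights always satisfy A+B+C+D == 16*16 == 256, hence the +rounder and
 * >>8 normalisation per output sample.
 */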

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
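
/* Editorial note (not in the original source): the tpel (thirdpel) filters
 * here and below, used by the SVQ3 decoder, approximate division by 3 resp.
 * 12 in fixed point: 683/2048 ~= 1/3 (the 1D weights sum to 3) and
 * 2731/32768 ~= 1/12 (the 2D weights sum to 12), with +1 resp. +6 as
 * rounding terms.
 */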
1379

    
1380
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1381
    int i,j;
1382
    for (i=0; i < height; i++) {
1383
      for (j=0; j < width; j++) {
1384
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1385
      }
1386
      src += stride;
1387
      dst += stride;
1388
    }
1389
}
1390

    
1391
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1392
    int i,j;
1393
    for (i=0; i < height; i++) {
1394
      for (j=0; j < width; j++) {
1395
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1396
      }
1397
      src += stride;
1398
      dst += stride;
1399
    }
1400
}
1401

    
1402
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1403
    int i,j;
1404
    for (i=0; i < height; i++) {
1405
      for (j=0; j < width; j++) {
1406
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1407
      }
1408
      src += stride;
1409
      dst += stride;
1410
    }
1411
}
1412

    
1413
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1414
    int i,j;
1415
    for (i=0; i < height; i++) {
1416
      for (j=0; j < width; j++) {
1417
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1418
      }
1419
      src += stride;
1420
      dst += stride;
1421
    }
1422
}
1423

    
1424
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1425
    int i,j;
1426
    for (i=0; i < height; i++) {
1427
      for (j=0; j < width; j++) {
1428
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1429
      }
1430
      src += stride;
1431
      dst += stride;
1432
    }
1433
}
1434

    
1435
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1436
    int i,j;
1437
    for (i=0; i < height; i++) {
1438
      for (j=0; j < width; j++) {
1439
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1440
      }
1441
      src += stride;
1442
      dst += stride;
1443
    }
1444
}
1445

    
1446
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1447
    int i,j;
1448
    for (i=0; i < height; i++) {
1449
      for (j=0; j < width; j++) {
1450
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1451
      }
1452
      src += stride;
1453
      dst += stride;
1454
    }
1455
}
1456

    
1457
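/* The avg_ variants below differ from the put_ versions only in the final
 * store: the thirdpel prediction is blended with the existing destination
 * by a rounded average, dst[j] = (dst[j] + pred + 1) >> 1. */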
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

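/* H.264 chroma MC: bilinear interpolation with eighth-pel weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy.  A+B+C+D = 64, hence the
 * (b + 32) >> 6 rounding in op_put below.  When D is zero, one of x,y is
 * zero and the 2D filter degenerates to a 1D filter along a row or a
 * column (selected by step). */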
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

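/* VC-1 chroma MC, "no rounding" variants: the same bilinear eighth-pel
 * weights as the H.264 chroma MC above, but with a reduced rounding bias
 * of 32 - 4 = 28 before the >> 6. */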
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

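/* MPEG-4 quarter-pel MC.  The lowpass is the 8-tap filter with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1); samples outside the 8 (or 16) input
 * range are mirrored at the block edges.  The weights sum to 32, hence
 * the (b + 16) >> 5 rounding in op_put below and (b + 15) >> 5 in the
 * no-rounding variants.  The _mcXY entry points combine the half-pel
 * planes with pixels*_l2/_l4 averages to reach the quarter-pel positions. */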
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

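/* H264_MC expands to the 16 _mcXY entry points for one block size, where
 * X and Y are the quarter-pel fractional offsets (0..3).  copy_block pads
 * the source into full[] with the two extra rows above and three below
 * that the 6-tap vertical filter needs; full_mid points at the block's
 * first real row. */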
#define H264_MC(OPNAME, SIZE) \
2495
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2496
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2497
}\
2498
\
2499
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2500
    uint8_t half[SIZE*SIZE];\
2501
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2502
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2503
}\
2504
\
2505
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2506
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2507
}\
2508
\
2509
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2510
    uint8_t half[SIZE*SIZE];\
2511
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2512
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2513
}\
2514
\
2515
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2516
    uint8_t full[SIZE*(SIZE+5)];\
2517
    uint8_t * const full_mid= full + SIZE*2;\
2518
    uint8_t half[SIZE*SIZE];\
2519
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2520
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2521
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2522
}\
2523
\
2524
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2525
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

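/* Illustrative sketch (kept out of the build): how the generated qpel
 * functions are typically selected.  The mcXY suffix encodes the quarter-pel
 * phase, X horizontally and Y vertically, so e.g. put_h264_qpel8_mc13_c
 * interpolates at (1/4, 3/4).  The table layout and index formula below
 * follow the usual dsputil convention but are assumptions of this sketch. */
#if 0
static void example_qpel_dispatch(DSPContext *dsp, uint8_t *dst, uint8_t *src,
                                  int stride, int mx, int my)
{
    /* [1] is assumed to be the 8x8 table; index = 4*(my&3) + (mx&3) */
    dsp->put_h264_qpel_pixels_tab[1][4*(my&3) + (mx&3)](dst, src, stride);
}
#endif
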
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT

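/* Worked example for the weight macros above: with log2_denom=1, weight=3
 * and offset=0, op_scale1 computes av_clip_uint8((p*3 + 1) >> 1), a rounded
 * 1.5x scale, so p=100 becomes 150.  The biweight variant blends two
 * predictions the same way:
 * av_clip_uint8((src*weights + dst*weightd + rounding) >> (log2_denom+1)). */
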
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

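/* The mspel lowpass filters above all apply the same 4-tap half-pel kernel
 * (-1, 9, 9, -1)/16 with rounding: out = clip((9*(b+c) - (a+d) + 8) >> 4)
 * for neighbours a,b,c,d.  Worked example: on a flat area a=b=c=d=p this
 * yields (16*p + 8) >> 4 = p, so the filter preserves DC. */
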
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

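/* The "p&256" test in the filters above is a fast range check for values
 * known to lie in [-256, 511]: bit 8 is set both for negative values (two's
 * complement) and for values >= 256.  ~(p>>31) then yields 0 for negative p
 * and ~0 for positive overflow, which the uint8_t store truncates to 255.
 * Standalone version of the trick (illustrative only, kept out of the
 * build): */
#if 0
static inline int example_clamp_uint8(int p)
{
    if (p & 256)
        p = ~(p >> 31) & 255;  /* 0 if p < 0, 255 if p >= 256 */
    return p;
}
#endif
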
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

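/* Each H.264 loop filter above is written once in terms of (xstride,
 * ystride) and specialized by the thin _v/_h wrappers: (stride, 1) filters
 * across a horizontal edge, (1, stride) transposes the access pattern so the
 * same kernel filters across a vertical edge.  Hypothetical caller
 * (illustrative only, kept out of the build): */
#if 0
static void example_deblock_mb_edge(uint8_t *pix, int stride,
                                    int alpha, int beta, int8_t *tc0)
{
    h264_h_loop_filter_luma_c(pix, stride, alpha, beta, tc0); /* vertical edge   */
    h264_v_loop_filter_luma_c(pix, stride, alpha, beta, tc0); /* horizontal edge */
}
#endif
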
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

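/* The _x2/_y2/_xy2 SAD variants above score half-pel motion vector
 * candidates without materializing the interpolated block: avg2() and
 * avg4() are the rounded averages defined earlier in this file,
 * avg2(a,b) = (a+b+1)>>1 and avg4(a,b,c,d) = (a+b+c+d+2)>>2, so e.g.
 * pix_abs8_xy2_c compares pix1 against the (1/2,1/2)-pel interpolation of
 * pix2 computed on the fly. */
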
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

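/* nsse ("noise preserving" SSE) scores a block as
 * SSE + nsse_weight * |texture difference|, where the second term compares
 * the 2x2 gradients of the two blocks.  This penalizes candidates that
 * smooth away noise/texture even when plain SSE would prefer them;
 * nsse_weight sets the trade-off, with 8 used when no context is
 * available. */
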
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

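/* Hypothetical use of ff_block_permute() (illustrative only, kept out of
 * the build): reordering coefficients for an IDCT whose input layout is
 * described by DSPContext.idct_permutation. */
#if 0
static void example_permute(DCTELEM block[64], DSPContext *dsp,
                            const uint8_t *scantable, int last_nonzero)
{
    ff_block_permute(block, dsp->idct_permutation, scantable, last_nonzero);
}
#endif
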
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

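/* Example of selecting a metric via ff_set_cmp() (illustrative only, kept
 * out of the build): fill a 6-entry comparison table with SATD, as an
 * encoder might for subpel motion refinement. */
#if 0
static void example_select_cmp(DSPContext *dsp, me_cmp_func sub_cmp[6])
{
    ff_set_cmp(dsp, sub_cmp, FF_CMP_SATD);
}
#endif
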
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

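/* The expression ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80) above is a SWAR
 * byte-wise add: masking with pb_7f (0x7f in every byte) keeps the addition
 * from carrying across byte boundaries, and xoring with (a^b)&pb_80 restores
 * bit 7 of each byte as the carry-less sum bit.  Worked example for one
 * byte: a=0x90, b=0x05 -> (0x10+0x05) ^ (0x95&0x80) = 0x15 ^ 0x80 = 0x95,
 * matching the plain byte sum 0x90+0x05. */
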
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

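/* The HuffYUV median predictor above predicts each sample as
 * mid_pred(left, top, left + top - topleft), the median of the left
 * neighbour, the top neighbour and the planar gradient.  Worked example:
 * left=10, top=20, topleft=15 gives the gradient 10+20-15=15, and the
 * median of {10, 20, 15} is 15, so a sample of 17 is coded as the residual
 * 17-15 = 2. */
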
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#else
#define B 0
#define G 1
#define R 2
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue){
    int i;
    int r,g,b;
    r= *red;
    g= *green;
    b= *blue;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
    }

    *red= r;
    *green= g;
    *blue= b;
}
#undef B
#undef G
#undef R

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

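/* hadamard8_diff8x8_c() is a SATD-style metric: the src-dst residual goes
 * through an 8x8 Hadamard transform built from three BUTTERFLY stages per
 * dimension (8 = 2^3), and the sum of absolute transform coefficients is
 * returned.  Using only additions, subtractions and FFABS(), it tracks the
 * transform-domain coding cost far better than plain SAD at a fraction of a
 * real DCT's cost. */
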
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

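/* dct264_sad8x8_c() runs the 1-D DCT8_1D macro twice by re-binding its
 * SRC()/DST() accessors: the first pass maps them to dct[i][x] (rows in
 * place), the second to dct[x][i] (columns) with DST() redefined to
 * accumulate FFABS(v) directly, so the column pass never writes the
 * transformed block back. */
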
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

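/* rd8x8_c() returns a true rate-distortion cost: the SSE between source and
 * reconstruction plus the estimated bit count weighted by a lambda
 * proportional to qscale^2 -- (bits*qscale*qscale*109 + 64) >> 7
 * approximates lambda*bits with 109/128 as the tuning constant. */
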
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3965
    MpegEncContext * const s= (MpegEncContext *)c;
3966
    const uint8_t *scantable= s->intra_scantable.permutated;
3967
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3968
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
3969
    int i, last, run, bits, level, start_i;
3970
    const int esc_length= s->ac_esc_length;
3971
    uint8_t * length;
3972
    uint8_t * last_length;
3973

    
3974
    assert(h==8);
3975

    
3976
    s->dsp.diff_pixels(temp, src1, src2, stride);
3977

    
3978
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3979

    
3980
    bits=0;
3981

    
3982
    if (s->mb_intra) {
3983
        start_i = 1;
3984
        length     = s->intra_ac_vlc_length;
3985
        last_length= s->intra_ac_vlc_last_length;
3986
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3987
    } else {
3988
        start_i = 0;
3989
        length     = s->inter_ac_vlc_length;
3990
        last_length= s->inter_ac_vlc_last_length;
3991
    }
3992

    
3993
    if(last>=start_i){
3994
        run=0;
3995
        for(i=start_i; i<last; i++){
3996
            int j= scantable[i];
3997
            level= temp[j];
3998

    
3999
            if(level){
4000
                level+=64;
4001
                if((level&(~127)) == 0){
4002
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
4003
                }else
4004
                    bits+= esc_length;
4005
                run=0;
4006
            }else
4007
                run++;
4008
        }
4009
        i= scantable[last];
4010

    
4011
        level= temp[i] + 64;
4012

    
4013
        assert(level - 64);
4014

    
4015
        if((level&(~127)) == 0){
4016
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
4017
        }else
4018
            bits+= esc_length;
4019
    }
4020

    
4021
    return bits;
4022
}
4023

    
4024
#define VSAD_INTRA(size) \
4025
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4026
    int score=0;                                                                                            \
4027
    int x,y;                                                                                                \
4028
                                                                                                            \
4029
    for(y=1; y<h; y++){                                                                                     \
4030
        for(x=0; x<size; x+=4){                                                                             \
4031
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
4032
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
4033
        }                                                                                                   \
4034
        s+= stride;                                                                                         \
4035
    }                                                                                                       \
4036
                                                                                                            \
4037
    return score;                                                                                           \
4038
}
4039
VSAD_INTRA(8)
4040
VSAD_INTRA(16)
4041

    
4042
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4043
    int score=0;
4044
    int x,y;
4045

    
4046
    for(y=1; y<h; y++){
4047
        for(x=0; x<16; x++){
4048
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
4049
        }
4050
        s1+= stride;
4051
        s2+= stride;
4052
    }
4053

    
4054
    return score;
4055
}
4056

    
4057
#define SQ(a) ((a)*(a))
4058
#define VSSE_INTRA(size) \
4059
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4060
    int score=0;                                                                                            \
4061
    int x,y;                                                                                                \
4062
                                                                                                            \
4063
    for(y=1; y<h; y++){                                                                                     \
4064
        for(x=0; x<size; x+=4){                                                                               \
4065
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
4066
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
4067
        }                                                                                                   \
4068
        s+= stride;                                                                                         \
4069
    }                                                                                                       \
4070
                                                                                                            \
4071
    return score;                                                                                           \
4072
}
4073
VSSE_INTRA(8)
4074
VSSE_INTRA(16)
4075

    
4076
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4077
    int score=0;
4078
    int x,y;
4079

    
4080
    for(y=1; y<h; y++){
4081
        for(x=0; x<16; x++){
4082
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
4083
        }
4084
        s1+= stride;
4085
        s2+= stride;
4086
    }
4087

    
4088
    return score;
4089
}
4090

    
4091
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

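/*
 * Instantiate the 16x16 comparison functions from their 8x8 kernels; the
 * WRAPPER8_16_SQ macro (defined earlier in this file) sums the scores of
 * the four 8x8 sub-blocks.
 */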
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

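/* In-place element-wise product: dst[i] *= src[i]. */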
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

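/* dst[i] = src0[i] * src1[len-1-i], i.e. multiply by src1 read backwards. */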
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

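/* Fused multiply-add: dst[i] = src0[i]*src1[i] + src2[i]. */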
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

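/**
 * Windowed overlap-add around the buffer center: for mirrored index pairs
 * (i, j) the outputs are
 *     dst[i] = src0[i]*win[j] - src1[j]*win[i] + add_bias;
 *     dst[j] = src0[i]*win[i] + src1[j]*win[j] + add_bias;
 * as used by MDCT-based audio decoders via DSPContext.vector_fmul_window;
 * add_bias folds a DC output offset into the same pass. A minimal usage
 * sketch (buffer names are hypothetical):
 *     float out[2*N];
 *     ff_vector_fmul_window_c(out, saved_overlap, new_coefs, window, 0, N);
 */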
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

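/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */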
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

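/*
 * The *_sv_scalar_* variants multiply each block of 2 (or 4) samples by its
 * own short vector: sv[] holds one 2- or 4-element float vector per output
 * block and advances by one vector per block, with a common scalar gain mul.
 */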
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

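/* As above, but without a src operand: dst is filled from sv[] * mul alone. */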
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

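/* In-place butterfly: (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]). */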
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

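/* Dot product of two float vectors. */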
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

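/* Convert int32 samples to float with scaling: dst[i] = src[i] * mul. */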
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

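/*
 * Helper for float clipping: the arguments are IEEE-754 single-precision bit
 * patterns passed as uint32_t (maxisign is presumably maxi with the sign bit
 * flipped, set up by the caller). For min <= 0 <= max, unsigned comparison
 * of bit patterns orders the floats correctly, so no FPU compares are needed.
 */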
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;