Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 3b636f21

History | View | Annotate | Download (161 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "mathops.h"
36
#include "mpegvideo.h"
37
#include "config.h"
38
#include "lpc.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42
#include "vp8dsp.h"
43

    
44
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45
uint32_t ff_squareTbl[512] = {0, };
46

    
47
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
48
#define pb_7f (~0UL/255 * 0x7f)
49
#define pb_80 (~0UL/255 * 0x80)
50

    
51
/* Standard JPEG/MPEG zigzag scan order for an 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
61

    
62
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
74

    
75
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (only the aligned storage is defined here; the contents are
 * generated at runtime elsewhere) */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
77

    
78
/* Alternate horizontal scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
88

    
89
/* Alternate vertical scan order for an 8x8 coefficient block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
99

    
100
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
101
 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
102
const uint32_t ff_inverse[257]={
103
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
104
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
105
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
106
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
107
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
108
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
109
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
110
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
111
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
112
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
113
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
114
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
115
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
116
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
117
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
118
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
119
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
120
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
121
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
122
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
123
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
124
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
125
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
126
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
127
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
128
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
129
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
130
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
131
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
132
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
133
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
134
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
135
  16777216
136
};
137

    
138
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
149

    
150
/* Row permutation used by the SSE2 IDCT: interleaves rows 0-3 with 4-7. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
151

    
152
/**
 * Initialize a ScanTable from a scan order and a CPU permutation.
 * Stores the raw scan order, the permutated scan order, and for every
 * scan position the maximum permutated index reached so far (raster_end).
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int idx;
    int max_seen;

    st->scantable = src_scantable;

    for (idx = 0; idx < 64; idx++) {
        const int pos = src_scantable[idx];
        st->permutated[idx] = permutation[pos];
#if ARCH_PPC
        st->inverse[pos] = idx;
#endif
    }

    max_seen = -1;
    for (idx = 0; idx < 64; idx++) {
        const int pos = st->permutated[idx];
        if (pos > max_seen)
            max_seen = pos;
        st->raster_end[idx] = max_seen;
    }
}
175

    
176
/* Sum of all 256 samples of a 16x16 block (line_size = bytes per row). */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
197

    
198
/* Sum of squares of all 256 samples of a 16x16 block.
 * sq points 256 entries into ff_squareTbl so the same table can be indexed
 * with negative differences in the sse*_c functions; here only 0..255 occur. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* reference byte-at-a-time version (disabled) */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* one 64-bit load, then extract the 8 bytes.
             * NOTE(review): the cast assumes pix may be read as a uint64_t
             * (alignment + strict aliasing) — confirm callers. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* two 32-bit loads on targets with 32-bit long */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
245

    
246
/* Byte-swap w 32-bit words from src into dst (may be the same buffer). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int n = 0;

    /* main loop, unrolled by 8 */
    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* remaining 0..7 words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
263

    
264
/* Sum of squared errors over a 4-wide block of h rows (table lookup). */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int row, acc = 0;
    uint32_t *sq = ff_squareTbl + 256; /* biased: accepts indices -255..255 */

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 4; col++)
            acc += sq[pix1[col] - pix2[col]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return acc;
}
280

    
281
/* Sum of squared errors over an 8-wide block of h rows (table lookup). */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int row, acc = 0;
    uint32_t *sq = ff_squareTbl + 256; /* biased: accepts indices -255..255 */

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 8; col++)
            acc += sq[pix1[col] - pix2[col]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return acc;
}
301

    
302
/* Sum of squared errors over a 16-wide block of h rows (table lookup). */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, acc = 0;
    uint32_t *sq = ff_squareTbl + 256; /* biased: accepts indices -255..255 */

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 16; col++)
            acc += sq[pix1[col] - pix2[col]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return acc;
}
331

    
332
/* draw the edges of width 'w' of an image of size width, height */
333
//FIXME check that this is ok for mpeg4 interlaced
334
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
335
{
336
    uint8_t *ptr, *last_line;
337
    int i;
338

    
339
    last_line = buf + (height - 1) * wrap;
340
    for(i=0;i<w;i++) {
341
        /* top and bottom */
342
        memcpy(buf - (i + 1) * wrap, buf, width);
343
        memcpy(last_line + (i + 1) * wrap, last_line, width);
344
    }
345
    /* left and right */
346
    ptr = buf;
347
    for(i=0;i<height;i++) {
348
        memset(ptr - w, ptr[0], w);
349
        memset(ptr + width, ptr[width-1], w);
350
        ptr += wrap;
351
    }
352
    /* corners */
353
    for(i=0;i<w;i++) {
354
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
355
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
356
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
357
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
358
    }
359
}
360

    
361
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp src_y/src_x so the block touches the picture by at least one
     * row/column; src is moved by the same amount so it still points at
     * the (clamped) top-left coordinate. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* Sub-rectangle of the block that lies inside the source picture. */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first copied row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last copied row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left: replicate the leftmost filled column
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right: replicate the rightmost filled column
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
431

    
432
/* Load an 8x8 block of pixels into a DCT coefficient block (widening copy). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
450

    
451
/* Store the 8x8 sample-wise difference s1 - s2 into a coefficient block. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
470

    
471

    
472
/* Write an 8x8 coefficient block as pixels, clamped to 0..255 via the crop table. */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;
    }
}
493

    
494
/* Write a 4x4 coefficient block as clamped pixels; the coefficient rows
 * are still 8 elements apart, hence block advances by 8 per row. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;
    }
}
511

    
512
/* Write a 2x2 coefficient block as clamped pixels; the coefficient rows
 * are still 8 elements apart, hence block advances by 8 per row. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 2; row++) {
        for (col = 0; col < 2; col++)
            pixels[col] = cm[block[col]];
        pixels += line_size;
        block  += 8;
    }
}
527

    
528
static void put_signed_pixels_clamped_c(const DCTELEM *block,
529
                                        uint8_t *restrict pixels,
530
                                        int line_size)
531
{
532
    int i, j;
533

    
534
    for (i = 0; i < 8; i++) {
535
        for (j = 0; j < 8; j++) {
536
            if (*block < -128)
537
                *pixels = 0;
538
            else if (*block > 127)
539
                *pixels = 255;
540
            else
541
                *pixels = (uint8_t)(*block + 128);
542
            block++;
543
            pixels++;
544
        }
545
        pixels += (line_size - 8);
546
    }
547
}
548

    
549
/* Write an 8x8 coefficient block as pixels without clamping
 * (values are truncated to 8 bits by the assignment). */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
569

    
570
/* Add an 8x8 residual block to the pixels, clamping to 0..255 via the crop table. */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;
    }
}
590

    
591
/* Add a 4x4 residual block to the pixels with clamping; coefficient rows
 * are still 8 apart, hence block advances by 8 per row. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;
    }
}
607

    
608
/* Add a 2x2 residual block to the pixels with clamping; coefficient rows
 * are still 8 apart, hence block advances by 8 per row. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int row, col;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    for (row = 0; row < 2; row++) {
        for (col = 0; col < 2; col++)
            pixels[col] = cm[pixels[col] + block[col]];
        pixels += line_size;
        block  += 8;
    }
}
622

    
623
/* Add an 8x8 coefficient block to the pixels without clamping. */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
639

    
640
/* Add a 4x4 coefficient block to the pixels without clamping.
 * Note: unlike the other 4-wide variants, the coefficients here are
 * packed 4 per row (block advances by 4). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
652

    
653
/* Sum of absolute values of all 64 coefficients of a block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int idx;

    for (idx = 0; idx < 64; idx++)
        total += FFABS(block[idx]);
    return total;
}
660

    
661
/* Fill a 16-wide block of h rows with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        memset(block, value, 16);
        block += line_size;
    }
}
670

    
671
/* Fill an 8-wide block of h rows with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        memset(block, value, 8);
        block += line_size;
    }
}
680

    
681
/* 2x upscale of an 8x8 block: every source pixel becomes a 2x2 square of
 * identical samples. Multiplying by 0x0101 duplicates the byte into both
 * halves of a uint16_t, so one store writes a horizontal pixel pair. */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;
    uint16_t *top    = (uint16_t *) dst;
    uint16_t *bottom = (uint16_t *)(dst + linesize);

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            top[col] = bottom[col] = src[col] * 0x0101;
        src += 8;
        /* uint16_t pointer arithmetic: += linesize skips two byte rows */
        top    += linesize;
        bottom += linesize;
    }
}
696

    
697
#if 0
698

699
#define PIXOP2(OPNAME, OP) \
700
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
701
{\
702
    int i;\
703
    for(i=0; i<h; i++){\
704
        OP(*((uint64_t*)block), AV_RN64(pixels));\
705
        pixels+=line_size;\
706
        block +=line_size;\
707
    }\
708
}\
709
\
710
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
711
{\
712
    int i;\
713
    for(i=0; i<h; i++){\
714
        const uint64_t a= AV_RN64(pixels  );\
715
        const uint64_t b= AV_RN64(pixels+1);\
716
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
717
        pixels+=line_size;\
718
        block +=line_size;\
719
    }\
720
}\
721
\
722
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
723
{\
724
    int i;\
725
    for(i=0; i<h; i++){\
726
        const uint64_t a= AV_RN64(pixels  );\
727
        const uint64_t b= AV_RN64(pixels+1);\
728
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
729
        pixels+=line_size;\
730
        block +=line_size;\
731
    }\
732
}\
733
\
734
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
735
{\
736
    int i;\
737
    for(i=0; i<h; i++){\
738
        const uint64_t a= AV_RN64(pixels          );\
739
        const uint64_t b= AV_RN64(pixels+line_size);\
740
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
741
        pixels+=line_size;\
742
        block +=line_size;\
743
    }\
744
}\
745
\
746
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
747
{\
748
    int i;\
749
    for(i=0; i<h; i++){\
750
        const uint64_t a= AV_RN64(pixels          );\
751
        const uint64_t b= AV_RN64(pixels+line_size);\
752
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
753
        pixels+=line_size;\
754
        block +=line_size;\
755
    }\
756
}\
757
\
758
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
759
{\
760
        int i;\
761
        const uint64_t a= AV_RN64(pixels  );\
762
        const uint64_t b= AV_RN64(pixels+1);\
763
        uint64_t l0=  (a&0x0303030303030303ULL)\
764
                    + (b&0x0303030303030303ULL)\
765
                    + 0x0202020202020202ULL;\
766
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
767
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
768
        uint64_t l1,h1;\
769
\
770
        pixels+=line_size;\
771
        for(i=0; i<h; i+=2){\
772
            uint64_t a= AV_RN64(pixels  );\
773
            uint64_t b= AV_RN64(pixels+1);\
774
            l1=  (a&0x0303030303030303ULL)\
775
               + (b&0x0303030303030303ULL);\
776
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
777
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
778
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
779
            pixels+=line_size;\
780
            block +=line_size;\
781
            a= AV_RN64(pixels  );\
782
            b= AV_RN64(pixels+1);\
783
            l0=  (a&0x0303030303030303ULL)\
784
               + (b&0x0303030303030303ULL)\
785
               + 0x0202020202020202ULL;\
786
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
787
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
788
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
789
            pixels+=line_size;\
790
            block +=line_size;\
791
        }\
792
}\
793
\
794
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
795
{\
796
        int i;\
797
        const uint64_t a= AV_RN64(pixels  );\
798
        const uint64_t b= AV_RN64(pixels+1);\
799
        uint64_t l0=  (a&0x0303030303030303ULL)\
800
                    + (b&0x0303030303030303ULL)\
801
                    + 0x0101010101010101ULL;\
802
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
803
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
804
        uint64_t l1,h1;\
805
\
806
        pixels+=line_size;\
807
        for(i=0; i<h; i+=2){\
808
            uint64_t a= AV_RN64(pixels  );\
809
            uint64_t b= AV_RN64(pixels+1);\
810
            l1=  (a&0x0303030303030303ULL)\
811
               + (b&0x0303030303030303ULL);\
812
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
813
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
814
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
815
            pixels+=line_size;\
816
            block +=line_size;\
817
            a= AV_RN64(pixels  );\
818
            b= AV_RN64(pixels+1);\
819
            l0=  (a&0x0303030303030303ULL)\
820
               + (b&0x0303030303030303ULL)\
821
               + 0x0101010101010101ULL;\
822
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
823
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
824
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
825
            pixels+=line_size;\
826
            block +=line_size;\
827
        }\
828
}\
829
\
830
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
831
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
832
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
833
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
834
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
835
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
836
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
837

838
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
839
#else // 64 bit variant
840

    
841
/*
 * PIXOP2(OPNAME, OP) instantiates the complete family of C pixel-block
 * primitives for OPNAME in {put, avg}: straight copies, half-pel
 * (x2/y2/xy2) interpolation, two-source (l2) and four-source (l4)
 * averaging, in widths 2/4/8/16, each also in a "no_rnd" (round-down)
 * flavour.
 *
 * This is the 32-bit variant (the branch above this #else holds the
 * 64-bit one): four bytes are processed at a time, packed in a uint32_t.
 *  - rnd_avg32()/no_rnd_avg32() average two packed words bytewise with
 *    no carries crossing byte lanes.
 *  - The four-way averages split each byte into a high part
 *    ((v&0xFC)>>2) and a 2-bit low part (v&0x03); the low parts plus the
 *    rounding bias (0x02020202 rounding, 0x01010101 no_rnd) are summed
 *    separately, then folded back via ((l0+l1)>>2)&0x0F0F0F0F so no
 *    per-byte sum can overflow into its neighbour.
 * OP(dst, val) is op_put (plain store) or op_avg (average with dst).
 */
#define PIXOP2(OPNAME, OP) \
/* straight copy (or average into dst) of a 2-pixel-wide block */\
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
/* a full-pel copy has nothing to round, so no_rnd == rnd here */\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
/* bytewise average of two independently-strided 8-wide sources */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
/* 16-wide variants are simply two 8-wide halves */\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* half-pel positions via the generic l2 kernels: x2 blends with the */\
/* right neighbour (pixels+1), y2 with the row below (pixels+line_size) */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* four-source average, rounding up (per-byte bias +2 before the >>2) */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
/* same as _pixels8_l4 but with the smaller bias (+1): rounds down */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* 2x2 bilinear (xy half-pel), scalar form for width 2; a0/b0 carry the */\
/* already-biased sums of the previous row so each row is read once */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* packed 2x2 bilinear for width 4: l0/h0 hold the previous row's */\
/* split partial sums, halving the loads (two output rows per pass) */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
/* width 8 = two width-4 columns (j loop); the pointer rewind at the */\
/* end of each pass steps over to the next 4-byte column */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
/* 16-wide entry points: run the 8-wide kernel on both halves */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1206

    
1207
/* bytewise average with the destination, rounding up */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* plain store: "put" overwrites the destination */
#define op_put(a, b) a = b

/* Instantiate the whole avg_* and put_* pixel-op family defined above. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar round-to-nearest averages of two resp. four samples */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1218

    
1219
/* DSPContext-signature adapter: forward to the macro-generated
 * put_no_rnd_pixels16_l2() with one common stride for dst and both
 * sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1222

    
1223
/* DSPContext-signature adapter: forward to the macro-generated
 * put_no_rnd_pixels8_l2() with one common stride for dst and both
 * sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1226

    
1227
/*
 * One-warp-point global motion compensation of an 8-pixel-wide strip:
 * 2-D bilinear interpolation at the fixed 1/16-pel phase (x16, y16),
 * with caller-supplied rounding bias and >>8 normalisation
 * (the four weights sum to 256).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights of the four neighbouring samples */
    const int w00 = (16 - x16) * (16 - y16);
    const int w01 = (     x16) * (16 - y16);
    const int w10 = (16 - x16) * (     y16);
    const int w11 = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (w00 * src[x]          + w01 * src[x + 1] +
                      w10 * src[stride + x] + w11 * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1249

    
1250
/**
 * Affine global motion compensation of one 8-pixel-wide strip.
 *
 * (ox,oy) is the source position of the first output pixel in fixed
 * point with 16 fractional bits below the 1/(1<<shift) sub-pel grid;
 * (dxx,dyx) is added per output column and (dxy,dyy) per output row.
 * Each pixel is bilinearly interpolated from its 2x2 source
 * neighbourhood with bias r and normalisation >>(shift*2); positions
 * outside the picture are clamped to the border (edge replication),
 * dropping the interpolation along the clamped axis.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* number of sub-pel positions per sample */

    /* pre-decrement so that "pos < width" guarantees pos+1 is still a
     * valid sample (the 2x2 neighbourhood fits) */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* drop the 16 low fractional bits; the remaining low
             * 'shift' bits are the sub-pel fraction, the rest the
             * integer sample coordinate */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare folds "src_x >= 0 && src_x < width"
             * into one test (negative values wrap to huge unsigneds) */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only
                     * (the *s keeps the >>(shift*2) normalisation) */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* outside on both axes: nearest border sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1307

    
1308
/* Full-pel case of the third-pel MC: plain copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1316

    
1317
/* Third-pel interpolation, horizontal phase 1/3:
 * out ~= (2*a + b)/3, in fixed point (683/2048 ~= 1/3, round-to-nearest). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + 1] + 1)) >> 11;
    }
}
1327

    
1328
/* Third-pel interpolation, horizontal phase 2/3:
 * out ~= (a + 2*b)/3, in fixed point (683/2048 ~= 1/3, round-to-nearest). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + 1] + 1)) >> 11;
    }
}
1338

    
1339
/* Third-pel interpolation, vertical phase 1/3:
 * out ~= (2*top + bottom)/3, fixed point (683/2048 ~= 1/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + stride] + 1)) >> 11;
    }
}
1349

    
1350
/* Third-pel interpolation at phase (1/3, 1/3): weighted 2x2 average with
 * weights 4/3/3/2 over 12, in fixed point (2731/32768 ~= 1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (4 * s[col] + 3 * s[col + 1] +
                              3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15;
    }
}
1360

    
1361
/* Third-pel interpolation at phase (1/3, 2/3): weighted 2x2 average with
 * weights 3/2/4/3 over 12, in fixed point (2731/32768 ~= 1/12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 2 * s[col + 1] +
                              4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
1371

    
1372
/* Third-pel interpolation, vertical phase 2/3:
 * out ~= (top + 2*bottom)/3, fixed point (683/2048 ~= 1/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + stride] + 1)) >> 11;
    }
}
1382

    
1383
/* Third-pel interpolation at phase (2/3, 1/3): weighted 2x2 average with
 * weights 3/4/2/3 over 12, in fixed point (2731/32768 ~= 1/12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 4 * s[col + 1] +
                              2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
1393

    
1394
/* Third-pel interpolation at phase (2/3, 2/3): weighted 2x2 average with
 * weights 2/3/3/4 over 12, in fixed point (2731/32768 ~= 1/12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (2 * s[col] + 3 * s[col + 1] +
                              3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15;
    }
}
1404

    
1405
/* Full-pel case of the third-pel MC, averaging flavour: defer to the
 * plain width-specific averaging routine. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1413

    
1414
/* Horizontal 1/3-phase third-pel filter (683/2048 ~= 1/3), then
 * round-up average with the samples already in dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
1424

    
1425
/* Horizontal 2/3-phase third-pel filter (683/2048 ~= 1/3), then
 * round-up average with the samples already in dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
1435

    
1436
/* Vertical 1/3-phase third-pel filter (683/2048 ~= 1/3), then
 * round-up average with the samples already in dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
1446

    
1447
/* Third-pel phase (1/3, 1/3): 2x2 weights 4/3/3/2 over 12
 * (2731/32768 ~= 1/12), then round-up average with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (4 * s[col] + 3 * s[col + 1] +
                                         3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1457

    
1458
/* Third-pel phase (1/3, 2/3): 2x2 weights 3/2/4/3 over 12
 * (2731/32768 ~= 1/12), then round-up average with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 2 * s[col + 1] +
                                         4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1468

    
1469
/* Vertical 2/3-phase third-pel filter (683/2048 ~= 1/3), then
 * round-up average with the samples already in dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
1479

    
1480
/* Third-pel phase (2/3, 1/3): 2x2 weights 3/4/2/3 over 12
 * (2731/32768 ~= 1/12), then round-up average with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 4 * s[col + 1] +
                                         2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1490

    
1491
/* Third-pel phase (2/3, 2/3): 2x2 weights 2/3/3/4 over 12
 * (2731/32768 ~= 1/12), then round-up average with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (2 * s[col] + 3 * s[col + 1] +
                                         3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
1501
#if 0
1502
#define TPEL_WIDTH(width)\
1503
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1504
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1505
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1506
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1507
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1508
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1509
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1510
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1511
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1512
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1513
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1514
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1515
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1516
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1517
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1518
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1519
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1520
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1521
#endif
1522

    
1523
/*
 * H264_CHROMA_MC(OPNAME, OP) emits the 2/4/8-pixel-wide chroma motion
 * compensation kernels: 2-D bilinear interpolation with eighth-pel
 * weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which sum to 64.
 * OP supplies the final +32 rounding and >>6 normalisation (op_put),
 * optionally averaging with the destination (op_avg).
 * When D==0 (x==0 or y==0) the filter degenerates to a 1-D blend of
 * two samples a distance 'step' apart -- along the row (step==1) or
 * the column (step==stride) -- saving half the multiplies.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* 1-D special case: combined second weight E, along row or column */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1623

    
1624
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1625
#define op_put(a, b) a = (((b) + 32)>>6)
1626

    
1627
H264_CHROMA_MC(put_       , op_put)
1628
H264_CHROMA_MC(avg_       , op_avg)
1629
#undef op_avg
1630
#undef op_put
1631

    
1632
/**
 * VC-1 8x8 no-rounding chroma MC (put variant).
 * Bilinear interpolation with 1/8-pel weights A,B,C,D and a biased
 * rounding constant of 32-4=28 (the "no_rnd" bias VC-1 uses).
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1655

    
1656
/**
 * VC-1 8x8 no-rounding chroma MC (avg variant).
 * Same bilinear filter and 32-4 bias as the put variant, but the result is
 * averaged into dst via avg2() (rounded halving, defined elsewhere in this file).
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
1679

    
1680
/* MPEG-4 quarter-pel MC template. OPNAME/RND select put/avg and rounding
 * variants; OP is the per-pixel store macro (uses the crop table cm).
 * First member: 8-wide horizontal lowpass with taps 20/-6/3/-1, edge-mirrored
 * at the right border (note the reused src[8]/src[7]/src[6] taps). */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1700
    const int w=8;\
1701
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1702
    int i;\
1703
    for(i=0; i<w; i++)\
1704
    {\
1705
        const int src0= src[0*srcStride];\
1706
        const int src1= src[1*srcStride];\
1707
        const int src2= src[2*srcStride];\
1708
        const int src3= src[3*srcStride];\
1709
        const int src4= src[4*srcStride];\
1710
        const int src5= src[5*srcStride];\
1711
        const int src6= src[6*srcStride];\
1712
        const int src7= src[7*srcStride];\
1713
        const int src8= src[8*srcStride];\
1714
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1715
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1716
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1717
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1718
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1719
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1720
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1721
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1722
        dst++;\
1723
        src++;\
1724
    }\
1725
}\
1726
\
1727
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1728
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1729
    int i;\
1730
    \
1731
    for(i=0; i<h; i++)\
1732
    {\
1733
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1734
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1735
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1736
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1737
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1738
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1739
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1740
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1741
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1742
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1743
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1744
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1745
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1746
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1747
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1748
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1749
        dst+=dstStride;\
1750
        src+=srcStride;\
1751
    }\
1752
}\
1753
\
1754
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1755
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1756
    int i;\
1757
    const int w=16;\
1758
    for(i=0; i<w; i++)\
1759
    {\
1760
        const int src0= src[0*srcStride];\
1761
        const int src1= src[1*srcStride];\
1762
        const int src2= src[2*srcStride];\
1763
        const int src3= src[3*srcStride];\
1764
        const int src4= src[4*srcStride];\
1765
        const int src5= src[5*srcStride];\
1766
        const int src6= src[6*srcStride];\
1767
        const int src7= src[7*srcStride];\
1768
        const int src8= src[8*srcStride];\
1769
        const int src9= src[9*srcStride];\
1770
        const int src10= src[10*srcStride];\
1771
        const int src11= src[11*srcStride];\
1772
        const int src12= src[12*srcStride];\
1773
        const int src13= src[13*srcStride];\
1774
        const int src14= src[14*srcStride];\
1775
        const int src15= src[15*srcStride];\
1776
        const int src16= src[16*srcStride];\
1777
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1778
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1779
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1780
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1781
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1782
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1783
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1784
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1785
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1786
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1787
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1788
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1789
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1790
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1791
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1792
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1793
        dst++;\
1794
        src++;\
1795
    }\
1796
}\
1797
\
1798
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1799
    OPNAME ## pixels8_c(dst, src, stride, 8);\
1800
}\
1801
\
1802
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1803
    uint8_t half[64];\
1804
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1805
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1806
}\
1807
\
1808
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1809
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1810
}\
1811
\
1812
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1813
    uint8_t half[64];\
1814
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1815
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1816
}\
1817
\
1818
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1819
    uint8_t full[16*9];\
1820
    uint8_t half[64];\
1821
    copy_block9(full, src, 16, stride, 9);\
1822
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1823
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1824
}\
1825
\
1826
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1827
    uint8_t full[16*9];\
1828
    copy_block9(full, src, 16, stride, 9);\
1829
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1830
}\
1831
\
1832
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1833
    uint8_t full[16*9];\
1834
    uint8_t half[64];\
1835
    copy_block9(full, src, 16, stride, 9);\
1836
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1837
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1838
}\
1839
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1840
    uint8_t full[16*9];\
1841
    uint8_t halfH[72];\
1842
    uint8_t halfV[64];\
1843
    uint8_t halfHV[64];\
1844
    copy_block9(full, src, 16, stride, 9);\
1845
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1846
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1847
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1848
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1849
}\
1850
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1851
    uint8_t full[16*9];\
1852
    uint8_t halfH[72];\
1853
    uint8_t halfHV[64];\
1854
    copy_block9(full, src, 16, stride, 9);\
1855
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1857
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1859
}\
1860
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861
    uint8_t full[16*9];\
1862
    uint8_t halfH[72];\
1863
    uint8_t halfV[64];\
1864
    uint8_t halfHV[64];\
1865
    copy_block9(full, src, 16, stride, 9);\
1866
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1867
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1868
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1869
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1870
}\
1871
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1872
    uint8_t full[16*9];\
1873
    uint8_t halfH[72];\
1874
    uint8_t halfHV[64];\
1875
    copy_block9(full, src, 16, stride, 9);\
1876
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1877
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1878
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1879
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1880
}\
1881
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882
    uint8_t full[16*9];\
1883
    uint8_t halfH[72];\
1884
    uint8_t halfV[64];\
1885
    uint8_t halfHV[64];\
1886
    copy_block9(full, src, 16, stride, 9);\
1887
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1888
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1889
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1891
}\
1892
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1893
    uint8_t full[16*9];\
1894
    uint8_t halfH[72];\
1895
    uint8_t halfHV[64];\
1896
    copy_block9(full, src, 16, stride, 9);\
1897
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1898
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1899
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1900
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1901
}\
1902
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903
    uint8_t full[16*9];\
1904
    uint8_t halfH[72];\
1905
    uint8_t halfV[64];\
1906
    uint8_t halfHV[64];\
1907
    copy_block9(full, src, 16, stride, 9);\
1908
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1909
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1910
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1911
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1912
}\
1913
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1914
    uint8_t full[16*9];\
1915
    uint8_t halfH[72];\
1916
    uint8_t halfHV[64];\
1917
    copy_block9(full, src, 16, stride, 9);\
1918
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1919
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1920
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1921
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1922
}\
1923
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1924
    uint8_t halfH[72];\
1925
    uint8_t halfHV[64];\
1926
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1927
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1928
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1929
}\
1930
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1931
    uint8_t halfH[72];\
1932
    uint8_t halfHV[64];\
1933
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1934
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1935
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1936
}\
1937
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1938
    uint8_t full[16*9];\
1939
    uint8_t halfH[72];\
1940
    uint8_t halfV[64];\
1941
    uint8_t halfHV[64];\
1942
    copy_block9(full, src, 16, stride, 9);\
1943
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1945
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1947
}\
1948
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1949
    uint8_t full[16*9];\
1950
    uint8_t halfH[72];\
1951
    copy_block9(full, src, 16, stride, 9);\
1952
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1953
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1954
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1955
}\
1956
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1957
    uint8_t full[16*9];\
1958
    uint8_t halfH[72];\
1959
    uint8_t halfV[64];\
1960
    uint8_t halfHV[64];\
1961
    copy_block9(full, src, 16, stride, 9);\
1962
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1964
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1966
}\
1967
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1968
    uint8_t full[16*9];\
1969
    uint8_t halfH[72];\
1970
    copy_block9(full, src, 16, stride, 9);\
1971
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1973
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1974
}\
1975
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1976
    uint8_t halfH[72];\
1977
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1978
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1979
}\
1980
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1981
    OPNAME ## pixels16_c(dst, src, stride, 16);\
1982
}\
1983
\
1984
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1985
    uint8_t half[256];\
1986
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1987
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1988
}\
1989
\
1990
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1991
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1992
}\
1993
\
1994
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1995
    uint8_t half[256];\
1996
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1997
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1998
}\
1999
\
2000
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2001
    uint8_t full[24*17];\
2002
    uint8_t half[256];\
2003
    copy_block17(full, src, 24, stride, 17);\
2004
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2005
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2006
}\
2007
\
2008
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2009
    uint8_t full[24*17];\
2010
    copy_block17(full, src, 24, stride, 17);\
2011
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2012
}\
2013
\
2014
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2015
    uint8_t full[24*17];\
2016
    uint8_t half[256];\
2017
    copy_block17(full, src, 24, stride, 17);\
2018
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2019
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2020
}\
2021
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2022
    uint8_t full[24*17];\
2023
    uint8_t halfH[272];\
2024
    uint8_t halfV[256];\
2025
    uint8_t halfHV[256];\
2026
    copy_block17(full, src, 24, stride, 17);\
2027
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2029
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2031
}\
2032
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2033
    uint8_t full[24*17];\
2034
    uint8_t halfH[272];\
2035
    uint8_t halfHV[256];\
2036
    copy_block17(full, src, 24, stride, 17);\
2037
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2039
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2040
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2041
}\
2042
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2043
    uint8_t full[24*17];\
2044
    uint8_t halfH[272];\
2045
    uint8_t halfV[256];\
2046
    uint8_t halfHV[256];\
2047
    copy_block17(full, src, 24, stride, 17);\
2048
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2049
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2050
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2051
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2052
}\
2053
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2054
    uint8_t full[24*17];\
2055
    uint8_t halfH[272];\
2056
    uint8_t halfHV[256];\
2057
    copy_block17(full, src, 24, stride, 17);\
2058
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2059
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2060
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2061
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2062
}\
2063
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2064
    uint8_t full[24*17];\
2065
    uint8_t halfH[272];\
2066
    uint8_t halfV[256];\
2067
    uint8_t halfHV[256];\
2068
    copy_block17(full, src, 24, stride, 17);\
2069
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2070
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2071
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2072
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2073
}\
2074
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2075
    uint8_t full[24*17];\
2076
    uint8_t halfH[272];\
2077
    uint8_t halfHV[256];\
2078
    copy_block17(full, src, 24, stride, 17);\
2079
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2080
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2081
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2082
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2083
}\
2084
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2085
    uint8_t full[24*17];\
2086
    uint8_t halfH[272];\
2087
    uint8_t halfV[256];\
2088
    uint8_t halfHV[256];\
2089
    copy_block17(full, src, 24, stride, 17);\
2090
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2091
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2092
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2093
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2094
}\
2095
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2096
    uint8_t full[24*17];\
2097
    uint8_t halfH[272];\
2098
    uint8_t halfHV[256];\
2099
    copy_block17(full, src, 24, stride, 17);\
2100
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2101
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2102
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2103
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2104
}\
2105
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2106
    uint8_t halfH[272];\
2107
    uint8_t halfHV[256];\
2108
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2109
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2110
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2111
}\
2112
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2113
    uint8_t halfH[272];\
2114
    uint8_t halfHV[256];\
2115
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2116
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2117
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2118
}\
2119
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2120
    uint8_t full[24*17];\
2121
    uint8_t halfH[272];\
2122
    uint8_t halfV[256];\
2123
    uint8_t halfHV[256];\
2124
    copy_block17(full, src, 24, stride, 17);\
2125
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2127
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2129
}\
2130
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2131
    uint8_t full[24*17];\
2132
    uint8_t halfH[272];\
2133
    copy_block17(full, src, 24, stride, 17);\
2134
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2135
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2136
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2137
}\
2138
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2139
    uint8_t full[24*17];\
2140
    uint8_t halfH[272];\
2141
    uint8_t halfV[256];\
2142
    uint8_t halfHV[256];\
2143
    copy_block17(full, src, 24, stride, 17);\
2144
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2146
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2148
}\
2149
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2150
    uint8_t full[24*17];\
2151
    uint8_t halfH[272];\
2152
    copy_block17(full, src, 24, stride, 17);\
2153
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2154
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2155
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2156
}\
2157
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2158
    uint8_t halfH[272];\
2159
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2160
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2161
}
2162

    
2163
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2164
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2165
#define op_put(a, b) a = cm[((b) + 16)>>5]
2166
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2167

    
2168
QPEL_MC(0, put_       , _       , op_put)
2169
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2170
QPEL_MC(0, avg_       , _       , op_avg)
2171
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2172
#undef op_avg
2173
#undef op_avg_no_rnd
2174
#undef op_put
2175
#undef op_put_no_rnd
2176

    
2177
#if 1
2178
#define H264_LOWPASS(OPNAME, OP, OP2) \
2179
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180
    const int h=2;\
2181
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182
    int i;\
2183
    for(i=0; i<h; i++)\
2184
    {\
2185
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2186
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2187
        dst+=dstStride;\
2188
        src+=srcStride;\
2189
    }\
2190
}\
2191
\
2192
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2193
    const int w=2;\
2194
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2195
    int i;\
2196
    for(i=0; i<w; i++)\
2197
    {\
2198
        const int srcB= src[-2*srcStride];\
2199
        const int srcA= src[-1*srcStride];\
2200
        const int src0= src[0 *srcStride];\
2201
        const int src1= src[1 *srcStride];\
2202
        const int src2= src[2 *srcStride];\
2203
        const int src3= src[3 *srcStride];\
2204
        const int src4= src[4 *srcStride];\
2205
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207
        dst++;\
2208
        src++;\
2209
    }\
2210
}\
2211
\
2212
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2213
    const int h=2;\
2214
    const int w=2;\
2215
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2216
    int i;\
2217
    src -= 2*srcStride;\
2218
    for(i=0; i<h+5; i++)\
2219
    {\
2220
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2221
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2222
        tmp+=tmpStride;\
2223
        src+=srcStride;\
2224
    }\
2225
    tmp -= tmpStride*(h+5-2);\
2226
    for(i=0; i<w; i++)\
2227
    {\
2228
        const int tmpB= tmp[-2*tmpStride];\
2229
        const int tmpA= tmp[-1*tmpStride];\
2230
        const int tmp0= tmp[0 *tmpStride];\
2231
        const int tmp1= tmp[1 *tmpStride];\
2232
        const int tmp2= tmp[2 *tmpStride];\
2233
        const int tmp3= tmp[3 *tmpStride];\
2234
        const int tmp4= tmp[4 *tmpStride];\
2235
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2236
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2237
        dst++;\
2238
        tmp++;\
2239
    }\
2240
}\
2241
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2242
    const int h=4;\
2243
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2244
    int i;\
2245
    for(i=0; i<h; i++)\
2246
    {\
2247
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2248
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2249
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2250
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2251
        dst+=dstStride;\
2252
        src+=srcStride;\
2253
    }\
2254
}\
2255
\
2256
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2257
    const int w=4;\
2258
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2259
    int i;\
2260
    for(i=0; i<w; i++)\
2261
    {\
2262
        const int srcB= src[-2*srcStride];\
2263
        const int srcA= src[-1*srcStride];\
2264
        const int src0= src[0 *srcStride];\
2265
        const int src1= src[1 *srcStride];\
2266
        const int src2= src[2 *srcStride];\
2267
        const int src3= src[3 *srcStride];\
2268
        const int src4= src[4 *srcStride];\
2269
        const int src5= src[5 *srcStride];\
2270
        const int src6= src[6 *srcStride];\
2271
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2272
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2273
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2274
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2275
        dst++;\
2276
        src++;\
2277
    }\
2278
}\
2279
\
2280
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281
    const int h=4;\
2282
    const int w=4;\
2283
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284
    int i;\
2285
    src -= 2*srcStride;\
2286
    for(i=0; i<h+5; i++)\
2287
    {\
2288
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2289
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2290
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2291
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2292
        tmp+=tmpStride;\
2293
        src+=srcStride;\
2294
    }\
2295
    tmp -= tmpStride*(h+5-2);\
2296
    for(i=0; i<w; i++)\
2297
    {\
2298
        const int tmpB= tmp[-2*tmpStride];\
2299
        const int tmpA= tmp[-1*tmpStride];\
2300
        const int tmp0= tmp[0 *tmpStride];\
2301
        const int tmp1= tmp[1 *tmpStride];\
2302
        const int tmp2= tmp[2 *tmpStride];\
2303
        const int tmp3= tmp[3 *tmpStride];\
2304
        const int tmp4= tmp[4 *tmpStride];\
2305
        const int tmp5= tmp[5 *tmpStride];\
2306
        const int tmp6= tmp[6 *tmpStride];\
2307
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2308
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2309
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2310
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2311
        dst++;\
2312
        tmp++;\
2313
    }\
2314
}\
2315
\
2316
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2317
    const int h=8;\
2318
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2319
    int i;\
2320
    for(i=0; i<h; i++)\
2321
    {\
2322
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2323
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2324
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2325
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2326
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2327
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2328
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2329
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2330
        dst+=dstStride;\
2331
        src+=srcStride;\
2332
    }\
2333
}\
2334
\
2335
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2336
    const int w=8;\
2337
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2338
    int i;\
2339
    for(i=0; i<w; i++)\
2340
    {\
2341
        const int srcB= src[-2*srcStride];\
2342
        const int srcA= src[-1*srcStride];\
2343
        const int src0= src[0 *srcStride];\
2344
        const int src1= src[1 *srcStride];\
2345
        const int src2= src[2 *srcStride];\
2346
        const int src3= src[3 *srcStride];\
2347
        const int src4= src[4 *srcStride];\
2348
        const int src5= src[5 *srcStride];\
2349
        const int src6= src[6 *srcStride];\
2350
        const int src7= src[7 *srcStride];\
2351
        const int src8= src[8 *srcStride];\
2352
        const int src9= src[9 *srcStride];\
2353
        const int src10=src[10*srcStride];\
2354
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2355
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2356
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2357
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2358
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2359
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2360
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2361
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2362
        dst++;\
2363
        src++;\
2364
    }\
2365
}\
2366
\
2367
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2368
    const int h=8;\
2369
    const int w=8;\
2370
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2371
    int i;\
2372
    src -= 2*srcStride;\
2373
    for(i=0; i<h+5; i++)\
2374
    {\
2375
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2376
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2377
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2378
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2379
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2380
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2381
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2382
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2383
        tmp+=tmpStride;\
2384
        src+=srcStride;\
2385
    }\
2386
    tmp -= tmpStride*(h+5-2);\
2387
    for(i=0; i<w; i++)\
2388
    {\
2389
        const int tmpB= tmp[-2*tmpStride];\
2390
        const int tmpA= tmp[-1*tmpStride];\
2391
        const int tmp0= tmp[0 *tmpStride];\
2392
        const int tmp1= tmp[1 *tmpStride];\
2393
        const int tmp2= tmp[2 *tmpStride];\
2394
        const int tmp3= tmp[3 *tmpStride];\
2395
        const int tmp4= tmp[4 *tmpStride];\
2396
        const int tmp5= tmp[5 *tmpStride];\
2397
        const int tmp6= tmp[6 *tmpStride];\
2398
        const int tmp7= tmp[7 *tmpStride];\
2399
        const int tmp8= tmp[8 *tmpStride];\
2400
        const int tmp9= tmp[9 *tmpStride];\
2401
        const int tmp10=tmp[10*tmpStride];\
2402
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2403
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2404
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2405
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2406
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2407
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2408
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2409
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2410
        dst++;\
2411
        tmp++;\
2412
    }\
2413
}\
2414
\
2415
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2416
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2417
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2418
    src += 8*srcStride;\
2419
    dst += 8*dstStride;\
2420
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2421
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2422
}\
2423
\
2424
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2425
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2426
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2427
    src += 8*srcStride;\
2428
    dst += 8*dstStride;\
2429
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2430
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2431
}\
2432
\
2433
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2434
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2435
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2436
    src += 8*srcStride;\
2437
    dst += 8*dstStride;\
2438
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2439
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2440
}\
2441

    
2442
/* H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points _mcXY_c for one block size, built from the _h/_v/_hv lowpass
 * filters above plus pixelsN_c/pixelsN_l2 averaging helpers. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2579
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2580
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2581
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2582
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2583
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2584

    
2585
H264_LOWPASS(put_       , op_put, op2_put)
2586
H264_LOWPASS(avg_       , op_avg, op2_avg)
2587
H264_MC(put_, 2)
2588
H264_MC(put_, 4)
2589
H264_MC(put_, 8)
2590
H264_MC(put_, 16)
2591
H264_MC(avg_, 4)
2592
H264_MC(avg_, 8)
2593
H264_MC(avg_, 16)
2594

    
2595
#undef op_avg
2596
#undef op_put
2597
#undef op2_avg
2598
#undef op2_put
2599
#endif
2600

    
2601
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2602
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2603
    int i;
2604

    
2605
    for(i=0; i<h; i++){
2606
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2607
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2608
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2609
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2610
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2611
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2612
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2613
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2614
        dst+=dstStride;
2615
        src+=srcStride;
2616
    }
2617
}
2618

    
2619
#if CONFIG_CAVS_DECODER
/* AVS specific: full-pel (mc00) copy/average wrappers around the plain
 * pixels8/16 helpers, exported for the CAVS decoder. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if CONFIG_VC1_DECODER
/* VC-1 specific: full-pel (mc00) wrappers; rnd is accepted for interface
 * compatibility but unused in the full-pel case. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
#if CONFIG_RV40_DECODER
/* RV40: the (3,3) quarter-pel position maps onto the xy2 (center half-pel
 * average) pixel helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
#if CONFIG_VP8_DECODER
/* VP8: plain pixel-copy wrappers; x and y are accepted for interface
 * compatibility with the vp8dsp function table but unused here. */
void ff_put_vp8_pixels16_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) {
    put_pixels16_c(dst, src, stride, h);
}
void ff_put_vp8_pixels8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) {
    put_pixels8_c(dst, src, stride, h);
}
void ff_put_vp8_pixels4_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) {
    put_pixels4_c(dst, src, stride, h);
}
#endif
2672
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2673
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2674
    int i;
2675

    
2676
    for(i=0; i<w; i++){
2677
        const int src_1= src[ -srcStride];
2678
        const int src0 = src[0          ];
2679
        const int src1 = src[  srcStride];
2680
        const int src2 = src[2*srcStride];
2681
        const int src3 = src[3*srcStride];
2682
        const int src4 = src[4*srcStride];
2683
        const int src5 = src[5*srcStride];
2684
        const int src6 = src[6*srcStride];
2685
        const int src7 = src[7*srcStride];
2686
        const int src8 = src[8*srcStride];
2687
        const int src9 = src[9*srcStride];
2688
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2689
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2690
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2691
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2692
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2693
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2694
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2695
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2696
        src++;
2697
        dst++;
2698
    }
2699
}
2700

    
/* MSPEL full-pel (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* MSPEL (1,0): average of the source and its horizontally filtered half. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* MSPEL (2,0): horizontal half-pel filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* MSPEL (3,0): average of src shifted right by one and the filtered half. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* MSPEL (0,2): vertical half-pel filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* MSPEL (1,2): average of the vertical half (halfV) and the h-then-v
 * filtered half (halfHV); halfH is 11 rows tall so the vertical pass has
 * the row above/below available. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL (3,2): like mc12 but the vertical half is taken one pixel to the
 * right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL (2,2): horizontal then vertical half-pel filter, result to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2749
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2750
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2751
    int x;
2752
    const int strength= ff_h263_loop_filter_strength[qscale];
2753

    
2754
    for(x=0; x<8; x++){
2755
        int d1, d2, ad1;
2756
        int p0= src[x-2*stride];
2757
        int p1= src[x-1*stride];
2758
        int p2= src[x+0*stride];
2759
        int p3= src[x+1*stride];
2760
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2761

    
2762
        if     (d<-2*strength) d1= 0;
2763
        else if(d<-  strength) d1=-2*strength - d;
2764
        else if(d<   strength) d1= d;
2765
        else if(d< 2*strength) d1= 2*strength - d;
2766
        else                   d1= 0;
2767

    
2768
        p1 += d1;
2769
        p2 -= d1;
2770
        if(p1&256) p1= ~(p1>>31);
2771
        if(p2&256) p2= ~(p2>>31);
2772

    
2773
        src[x-1*stride] = p1;
2774
        src[x+0*stride] = p2;
2775

    
2776
        ad1= FFABS(d1)>>1;
2777

    
2778
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2779

    
2780
        src[x-2*stride] = p0 - d2;
2781
        src[x+  stride] = p3 + d2;
2782
    }
2783
    }
2784
}
2785

    
2786
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2787
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2788
    int y;
2789
    const int strength= ff_h263_loop_filter_strength[qscale];
2790

    
2791
    for(y=0; y<8; y++){
2792
        int d1, d2, ad1;
2793
        int p0= src[y*stride-2];
2794
        int p1= src[y*stride-1];
2795
        int p2= src[y*stride+0];
2796
        int p3= src[y*stride+1];
2797
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2798

    
2799
        if     (d<-2*strength) d1= 0;
2800
        else if(d<-  strength) d1=-2*strength - d;
2801
        else if(d<   strength) d1= d;
2802
        else if(d< 2*strength) d1= 2*strength - d;
2803
        else                   d1= 0;
2804

    
2805
        p1 += d1;
2806
        p2 -= d1;
2807
        if(p1&256) p1= ~(p1>>31);
2808
        if(p2&256) p2= ~(p2>>31);
2809

    
2810
        src[y*stride-1] = p1;
2811
        src[y*stride+0] = p2;
2812

    
2813
        ad1= FFABS(d1)>>1;
2814

    
2815
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2816

    
2817
        src[y*stride-2] = p0 - d2;
2818
        src[y*stride+1] = p3 + d2;
2819
    }
2820
    }
2821
}
2822

    
2823
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * First a vertical pass into temp[] (edge rows passed through as 4*src),
 * then a horizontal pass back into src with rounding; edge columns are
 * passed through unfiltered.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks of
 * h rows. The leading void* matches the me_cmp function-pointer signature
 * and is unused.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of pix1 against pix2 interpolated half a pixel to the right
 * (avg2 of each pixel and its right neighbour), 16 wide, h rows.
 * Reads pix2[0..16] per row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of pix1 against pix2 interpolated half a pixel down
 * (avg2 of each pixel and the pixel one line below), 16 wide, h rows.
 * Reads h+1 rows of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* SAD of a 16-pixel-wide block against the reference interpolated at the
 * half-pel diagonal position (4-tap average of a 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2965

    
2966
/* Plain SAD (sum of absolute differences) of an 8-pixel-wide block. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2985

    
2986
/* SAD of an 8-pixel-wide block against the reference interpolated at the
 * half-pel horizontal position (average of each pixel and its right neighbour). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3005

    
3006
/* SAD of an 8-pixel-wide block against the reference interpolated at the
 * half-pel vertical position (average of each pixel and the one below). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3027

    
3028
/* SAD of an 8-pixel-wide block against the reference interpolated at the
 * half-pel diagonal position (4-tap average of a 2x2 neighbourhood). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3049

    
3050
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3051
    MpegEncContext *c = v;
3052
    int score1=0;
3053
    int score2=0;
3054
    int x,y;
3055

    
3056
    for(y=0; y<h; y++){
3057
        for(x=0; x<16; x++){
3058
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3059
        }
3060
        if(y+1<h){
3061
            for(x=0; x<15; x++){
3062
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3063
                             - s1[x+1] + s1[x+1+stride])
3064
                        -FFABS(  s2[x  ] - s2[x  +stride]
3065
                             - s2[x+1] + s2[x+1+stride]);
3066
            }
3067
        }
3068
        s1+= stride;
3069
        s2+= stride;
3070
    }
3071

    
3072
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3073
    else  return score1 + FFABS(score2)*8;
3074
}
3075

    
3076
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3077
    MpegEncContext *c = v;
3078
    int score1=0;
3079
    int score2=0;
3080
    int x,y;
3081

    
3082
    for(y=0; y<h; y++){
3083
        for(x=0; x<8; x++){
3084
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3085
        }
3086
        if(y+1<h){
3087
            for(x=0; x<7; x++){
3088
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3089
                             - s1[x+1] + s1[x+1+stride])
3090
                        -FFABS(  s2[x  ] - s2[x  +stride]
3091
                             - s2[x+1] + s2[x+1+stride]);
3092
            }
3093
        }
3094
        s1+= stride;
3095
        s2+= stride;
3096
    }
3097

    
3098
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3099
    else  return score1 + FFABS(score2)*8;
3100
}
3101

    
3102
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3103
    int i;
3104
    unsigned int sum=0;
3105

    
3106
    for(i=0; i<8*8; i++){
3107
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3108
        int w= weight[i];
3109
        b>>= RECON_SHIFT;
3110
        assert(-512<b && b<512);
3111

    
3112
        sum += (w*b)*(w*b)>>4;
3113
    }
3114
    return sum>>2;
3115
}
3116

    
3117
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3118
    int i;
3119

    
3120
    for(i=0; i<8*8; i++){
3121
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3122
    }
3123
}
3124

    
3125
/**
3126
 * permutes an 8x8 block.
3127
 * @param block the block which will be permuted according to the given permutation vector
3128
 * @param permutation the permutation vector
3129
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3130
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3131
 *                  (inverse) permutated to scantable order!
3132
 */
3133
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3134
{
3135
    int i;
3136
    DCTELEM temp[64];
3137

    
3138
    if(last<=0) return;
3139
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3140

    
3141
    for(i=0; i<=last; i++){
3142
        const int j= scantable[i];
3143
        temp[j]= block[j];
3144
        block[j]=0;
3145
    }
3146

    
3147
    for(i=0; i<=last; i++){
3148
        const int j= scantable[i];
3149
        const int perm_j= permutation[j];
3150
        block[perm_j]= temp[j];
3151
    }
3152
}
3153

    
3154
/* Dummy comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3157

    
3158
/* Fill the 6-entry cmp[] function table from the DSPContext according to the
 * FF_CMP_* comparison type stored in the low byte of 'type'. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3217

    
3218
/* Zero a single 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
3222

    
3223
/**
3224
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3225
 */
3226
static void clear_blocks_c(DCTELEM *blocks)
3227
{
3228
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3229
}
3230

    
3231
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3232
    long i;
3233
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3234
        long a = *(long*)(src+i);
3235
        long b = *(long*)(dst+i);
3236
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3237
    }
3238
    for(; i<w; i++)
3239
        dst[i+0] += src[i+0];
3240
}
3241

    
3242
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3243
    long i;
3244
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3245
        long a = *(long*)(src1+i);
3246
        long b = *(long*)(src2+i);
3247
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3248
    }
3249
    for(; i<w; i++)
3250
        dst[i] = src1[i]+src2[i];
3251
}
3252

    
3253
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3254
    long i;
3255
#if !HAVE_FAST_UNALIGNED
3256
    if((long)src2 & (sizeof(long)-1)){
3257
        for(i=0; i+7<w; i+=8){
3258
            dst[i+0] = src1[i+0]-src2[i+0];
3259
            dst[i+1] = src1[i+1]-src2[i+1];
3260
            dst[i+2] = src1[i+2]-src2[i+2];
3261
            dst[i+3] = src1[i+3]-src2[i+3];
3262
            dst[i+4] = src1[i+4]-src2[i+4];
3263
            dst[i+5] = src1[i+5]-src2[i+5];
3264
            dst[i+6] = src1[i+6]-src2[i+6];
3265
            dst[i+7] = src1[i+7]-src2[i+7];
3266
        }
3267
    }else
3268
#endif
3269
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3270
        long a = *(long*)(src1+i);
3271
        long b = *(long*)(src2+i);
3272
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3273
    }
3274
    for(; i<w; i++)
3275
        dst[i+0] = src1[i+0]-src2[i+0];
3276
}
3277

    
3278
/* HuffYUV median prediction decode: dst[i] = median(left, top, left+top-topleft)
 * + diff[i], updating the running left/left_top state for the caller. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
3294

    
3295
/* HuffYUV median prediction encode: dst[i] = src2[i] - median predictor;
 * the inverse of add_hfyu_median_prediction_c. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3312

    
3313
/* HuffYUV left prediction decode: running sum of src into dst starting from
 * 'acc'; returns the final accumulator. Main loop is unrolled by two. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    /* tail for odd w */
    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
3331

    
3332
/* Byte offsets of the B/G/R/A components within a packed 32-bit BGRA pixel,
 * depending on host endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/* HuffYUV left prediction decode for BGR32: per-channel running sums over
 * packed 4-byte pixels, state carried in/out through the pointer arguments. */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
3372

    
3373
/* 2-point butterfly writing to separate outputs: o1 = i1+i2, o2 = i1-i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y <- x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of absolute values of the butterfly of x and y. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3387

    
3388
/* SATD: 8x8 Hadamard transform of the src-dst difference, returning the sum
 * of absolute transform coefficients (rows first, then columns). */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard pass on each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass per column; last stage folded into the abs-sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3439

    
3440
/* Intra SATD: 8x8 Hadamard transform of src itself, sum of absolute
 * coefficients minus the DC term (the block mean is not a useful cost). */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard pass on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass per column; last stage folded into the abs-sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3487

    
3488
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3489
    MpegEncContext * const s= (MpegEncContext *)c;
3490
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3491

    
3492
    assert(h==8);
3493

    
3494
    s->dsp.diff_pixels(temp, src1, src2, stride);
3495
    s->dsp.fdct(temp);
3496
    return s->dsp.sum_abs_dctelem(temp);
3497
}
3498

    
3499
#if CONFIG_GPL
/* One 8-point H.264-style integer DCT (high-accuracy 8x8 transform);
 * SRC/DST are defined by the caller to select row/column access. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/* Cost based on the H.264 8x8 transform of the difference: rows transformed
 * in place, column pass summed directly as absolute values. */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3551

    
3552
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3553
    MpegEncContext * const s= (MpegEncContext *)c;
3554
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3555
    int sum=0, i;
3556

    
3557
    assert(h==8);
3558

    
3559
    s->dsp.diff_pixels(temp, src1, src2, stride);
3560
    s->dsp.fdct(temp);
3561

    
3562
    for(i=0; i<64; i++)
3563
        sum= FFMAX(sum, FFABS(temp[i]));
3564

    
3565
    return sum;
3566
}
3567

    
3568
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3569
    MpegEncContext * const s= (MpegEncContext *)c;
3570
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3571
    DCTELEM * const bak = temp+64;
3572
    int sum=0, i;
3573

    
3574
    assert(h==8);
3575
    s->mb_intra=0;
3576

    
3577
    s->dsp.diff_pixels(temp, src1, src2, stride);
3578

    
3579
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3580

    
3581
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3582
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3583
    ff_simple_idct(temp); //FIXME
3584

    
3585
    for(i=0; i<64; i++)
3586
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3587

    
3588
    return sum;
3589
}
3590

    
3591
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3592
    MpegEncContext * const s= (MpegEncContext *)c;
3593
    const uint8_t *scantable= s->intra_scantable.permutated;
3594
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3595
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3596
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3597
    int i, last, run, bits, level, distortion, start_i;
3598
    const int esc_length= s->ac_esc_length;
3599
    uint8_t * length;
3600
    uint8_t * last_length;
3601

    
3602
    assert(h==8);
3603

    
3604
    copy_block8(lsrc1, src1, 8, stride, 8);
3605
    copy_block8(lsrc2, src2, 8, stride, 8);
3606

    
3607
    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3608

    
3609
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3610

    
3611
    bits=0;
3612

    
3613
    if (s->mb_intra) {
3614
        start_i = 1;
3615
        length     = s->intra_ac_vlc_length;
3616
        last_length= s->intra_ac_vlc_last_length;
3617
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3618
    } else {
3619
        start_i = 0;
3620
        length     = s->inter_ac_vlc_length;
3621
        last_length= s->inter_ac_vlc_last_length;
3622
    }
3623

    
3624
    if(last>=start_i){
3625
        run=0;
3626
        for(i=start_i; i<last; i++){
3627
            int j= scantable[i];
3628
            level= temp[j];
3629

    
3630
            if(level){
3631
                level+=64;
3632
                if((level&(~127)) == 0){
3633
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3634
                }else
3635
                    bits+= esc_length;
3636
                run=0;
3637
            }else
3638
                run++;
3639
        }
3640
        i= scantable[last];
3641

    
3642
        level= temp[i] + 64;
3643

    
3644
        assert(level - 64);
3645

    
3646
        if((level&(~127)) == 0){
3647
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3648
        }else
3649
            bits+= esc_length;
3650

    
3651
    }
3652

    
3653
    if(last>=0){
3654
        if(s->mb_intra)
3655
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
3656
        else
3657
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
3658
    }
3659

    
3660
    s->dsp.idct_add(lsrc2, 8, temp);
3661

    
3662
    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3663

    
3664
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3665
}
3666

    
3667
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3668
    MpegEncContext * const s= (MpegEncContext *)c;
3669
    const uint8_t *scantable= s->intra_scantable.permutated;
3670
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3671
    int i, last, run, bits, level, start_i;
3672
    const int esc_length= s->ac_esc_length;
3673
    uint8_t * length;
3674
    uint8_t * last_length;
3675

    
3676
    assert(h==8);
3677

    
3678
    s->dsp.diff_pixels(temp, src1, src2, stride);
3679

    
3680
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3681

    
3682
    bits=0;
3683

    
3684
    if (s->mb_intra) {
3685
        start_i = 1;
3686
        length     = s->intra_ac_vlc_length;
3687
        last_length= s->intra_ac_vlc_last_length;
3688
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3689
    } else {
3690
        start_i = 0;
3691
        length     = s->inter_ac_vlc_length;
3692
        last_length= s->inter_ac_vlc_last_length;
3693
    }
3694

    
3695
    if(last>=start_i){
3696
        run=0;
3697
        for(i=start_i; i<last; i++){
3698
            int j= scantable[i];
3699
            level= temp[j];
3700

    
3701
            if(level){
3702
                level+=64;
3703
                if((level&(~127)) == 0){
3704
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
3705
                }else
3706
                    bits+= esc_length;
3707
                run=0;
3708
            }else
3709
                run++;
3710
        }
3711
        i= scantable[last];
3712

    
3713
        level= temp[i] + 64;
3714

    
3715
        assert(level - 64);
3716

    
3717
        if((level&(~127)) == 0){
3718
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3719
        }else
3720
            bits+= esc_length;
3721
    }
3722

    
3723
    return bits;
3724
}
3725

    
3726
/* Generate vsad_intra8_c / vsad_intra16_c: sum of absolute vertical
 * differences between adjacent rows of a single block (intra "noisiness"). */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3743

    
3744
/* Sum of absolute vertical differences of the residual (s1-s2) between
 * adjacent rows, 16 pixels wide. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3758

    
3759
/* Squared value helper for the VSSE metrics below. */
#define SQ(a) ((a)*(a))
/* Generate vsse_intra8_c / vsse_intra16_c: sum of squared vertical
 * differences between adjacent rows of a single block. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3777

    
3778
/* Sum of squared vertical differences of the residual (s1-s2) between
 * adjacent rows, 16 pixels wide. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3792

    
3793
/* Sum of squared differences between an int8 and an int16 array. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
3801

    
3802
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3803
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3804
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3805
#if CONFIG_GPL
3806
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3807
#endif
3808
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3809
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3810
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3811
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3812

    
3813
/* Element-wise in-place multiply: dst[i] *= src[i]. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
3818

    
3819
/** dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
3825

    
3826
/** Fused multiply-add over vectors: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i = 0;
    while (i < len) {
        dst[i] = src0[i] * src1[i] + src2[i];
        i++;
    }
}
3831

    
3832
/**
 * Overlap-add windowing used by MDCT-based codecs.
 * Writes 2*len outputs: for each pair (i, j) mirrored around the centre,
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i] + add_bias
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j] + add_bias
 * dst/win hold 2*len elements, src0/src1 hold len elements each.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i, j;
    /* re-base pointers on the centre of the 2*len output/window */
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        const float a  = src0[i];
        const float b  = src1[j];
        const float w0 = win[i];
        const float w1 = win[j];
        dst[i] = a * w1 - b * w0 + add_bias;
        dst[j] = a * w0 + b * w1 + add_bias;
    }
}
3846

    
3847
/** Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
3854

    
3855
/**
 * Per-pair scaling: each consecutive pair of src elements is multiplied
 * by the corresponding 2-element sub-vector sv[k] and the scalar mul.
 * len must be a multiple of 2; sv holds len/2 pointers.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0, k = 0; i < len; i += 2, k++) {
        dst[i]     = src[i]     * sv[k][0] * mul;
        dst[i + 1] = src[i + 1] * sv[k][1] * mul;
    }
}
3864

    
3865
/**
 * Per-quad scaling: each group of four src elements is multiplied by the
 * corresponding 4-element sub-vector sv[k] and the scalar mul.
 * len must be a multiple of 4; sv holds len/4 pointers.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0, k = 0; i < len; i += 4, k++) {
        dst[i]     = src[i]     * sv[k][0] * mul;
        dst[i + 1] = src[i + 1] * sv[k][1] * mul;
        dst[i + 2] = src[i + 2] * sv[k][2] * mul;
        dst[i + 3] = src[i + 3] * sv[k][3] * mul;
    }
}
3876

    
3877
/**
 * Expand 2-element sub-vectors scaled by mul: pair k of dst gets
 * sv[k][0..1] * mul. len must be a multiple of 2.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, k;
    for (i = 0, k = 0; i < len; i += 2, k++) {
        dst[i]     = sv[k][0] * mul;
        dst[i + 1] = sv[k][1] * mul;
    }
}
3886

    
3887
/**
 * Expand 4-element sub-vectors scaled by mul: quad k of dst gets
 * sv[k][0..3] * mul. len must be a multiple of 4.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, k;
    for (i = 0, k = 0; i < len; i += 4, k++) {
        dst[i]     = sv[k][0] * mul;
        dst[i + 1] = sv[k][1] * mul;
        dst[i + 2] = sv[k][2] * mul;
        dst[i + 3] = sv[k][3] * mul;
    }
}
3898

    
3899
/**
 * Element-wise butterfly: v1[i] becomes the sum, v2[i] the difference,
 * of the original v1[i] and v2[i]. The vectors must not alias (restrict).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        const float a = v1[i];
        const float b = v2[i];
        v1[i] = a + b;
        v2[i] = a - b;
    }
}
3909

    
3910
/** Dot product of two float vectors, accumulated in order. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float sum = 0.0f;

    while (len-- > 0)
        sum += *v1++ * *v2++;

    return sum;
}
3920

    
3921
/** Convert int samples to float while scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    const int *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
3926

    
3927
/**
 * Clip a single float, given as its raw IEEE-754 bit pattern 'a', to
 * [min, max] whose bit patterns are 'mini'/'maxi'; maxisign = maxi^(1<<31).
 * Valid only when min < 0 < max (guaranteed by vector_clipf_c):
 * negative floats carry the sign bit, so as unsigned integers they compare
 * greater than every positive pattern, and a larger unsigned value means a
 * more negative float — hence a > mini  <=>  value(a) < min.
 * For the upper bound the sign bit is toggled so that only positive inputs
 * can exceed maxisign, which then means value(a) > max.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;                      /* below min: clamp to min's bits */
    else if((a^(1<<31)) > maxisign) return maxi;   /* above max: clamp to max's bits */
    else return a;                                 /* already inside the range */
}
3935

    
3936
/**
 * Clip each float in src to [*min, *max] using only integer compares on the
 * raw bit patterns (see clipf_c_one). Only called from vector_clipf_c when
 * *min < 0 < *max; the loop is unrolled by 8, so len must be a multiple of 8.
 * NOTE(review): the pointer casts type-pun float as uint32_t, which formally
 * violates strict aliasing — pre-existing behavior, left unchanged here.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
3954
/**
 * Clip each float in src to [min, max] and store to dst.
 * len must be a multiple of 8 (both paths process 8 elements per step).
 * When the bounds straddle zero the integer-compare fast path is used.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i, k;
    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }
    for (i = 0; i < len; i += 8)
        for (k = 0; k < 8; k++)
            dst[i + k] = av_clipf(src[i + k], min, max);
}
3971

    
3972
/**
 * Fast float -> int16 conversion via the raw IEEE-754 bit pattern.
 * NOTE(review): assumes the caller pre-scaled/biased the float so that the
 * useful sample range lands in the mantissa (magic-bias trick) — confirm
 * against the ff_float_to_int16* callers before reusing elsewhere.
 * When exponent bits in 0xf0000 are set the value is out of range; the
 * expression below yields 0 or -1, which after the final -0x8000 bias
 * stores as the int16 extremes.
 */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3982

    
3983
/**
 * Convert an array of floats to int16 samples; see float_to_int16_one for
 * the expected input bias/saturation convention.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = float_to_int16_one(&src[i]);
}
3988

    
3989
/**
 * Convert planar float channels to interleaved int16 samples.
 * src holds one float plane per channel; dst receives channel-interleaved
 * output. The stereo case is special-cased as a pairwise loop.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, j, c;

    if (channels == 2) { /* common stereo case */
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
        return;
    }

    for (c = 0; c < channels; c++)
        for (i = 0, j = c; i < len; i++, j += channels)
            dst[j] = float_to_int16_one(src[c] + i);
}
4002

    
4003
/**
 * Dot product of two int16 vectors with each product right-shifted by
 * 'shift' before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int i, sum = 0;

    for (i = 0; i < order; i++)
        sum += (v1[i] * v2[i]) >> shift;

    return sum;
}
4012

    
4013
/**
 * Returns the dot product of v1 and v2 while simultaneously updating
 * v1 in place: v1[i] += mul * v3[i]. The dot product uses the values of
 * v1 from before the update.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int i, sum = 0;
    for (i = 0; i < order; i++) {
        sum   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return sum;
}
4022

    
4023
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/** One row of the WMV2 8x8 inverse DCT, operating in place on b[0..7]. */
static void wmv2_idct_row(short * b)
{
    int e0, e1, e2, e3;     /* even-part terms  */
    int o1, o3, o5, o7;     /* odd-part terms   */
    int t1, t2;

    /* stage 1: input rotations */
    o1 = W1*b[1] + W7*b[7];
    o7 = W7*b[1] - W1*b[7];
    o5 = W5*b[5] + W3*b[3];
    o3 = W3*b[5] - W5*b[3];
    e2 = W2*b[2] + W6*b[6];
    e3 = W6*b[2] - W2*b[6];
    e0 = W0*b[0] + W0*b[4];
    e1 = W0*b[0] - W0*b[4];

    /* stage 2: sqrt(2)/2 (181/256) scaling of the cross odd terms */
    t1 = (181*(o1 - o5 + o7 - o3) + 128) >> 8;
    t2 = (181*(o1 - o5 - o7 + o3) + 128) >> 8;

    /* stage 3: output butterflies with rounding by (1<<7) before >>8 */
    b[0] = (e0 + e2 + o1 + o5 + (1<<7)) >> 8;
    b[1] = (e1 + e3 + t1      + (1<<7)) >> 8;
    b[2] = (e1 - e3 + t2      + (1<<7)) >> 8;
    b[3] = (e0 - e2 + o7 + o3 + (1<<7)) >> 8;
    b[4] = (e0 - e2 - o7 - o3 + (1<<7)) >> 8;
    b[5] = (e1 - e3 - t2      + (1<<7)) >> 8;
    b[6] = (e1 + e3 - t1      + (1<<7)) >> 8;
    b[7] = (e0 + e2 - o1 - o5 + (1<<7)) >> 8;
}
4058
static void wmv2_idct_col(short * b)
4059
{
4060
    int s1,s2;
4061
    int a0,a1,a2,a3,a4,a5,a6,a7;
4062
    /*step 1, with extended precision*/
4063
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4064
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4065
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4066
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4067
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4068
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4069
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4070
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4071
    /*step 2*/
4072
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4073
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4074
    /*step 3*/
4075
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4076
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4077
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4078
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4079

    
4080
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4081
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4082
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4083
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4084
}
4085
/** Full 8x8 WMV2 inverse DCT, in place: all rows first, then all columns. */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
4095
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4096
 converted */
4097
/* WMV2 IDCT on 'block' in place, then store the clamped samples to dest
 * (row stride line_size). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
4102
/* WMV2 IDCT on 'block' in place, then add the clamped samples onto dest
 * (row stride line_size). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4107
/* Full 8x8 integer reference IDCT (j_rev_dct), then store clamped pixels. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
4112
/* Full 8x8 integer reference IDCT (j_rev_dct), then add clamped pixels. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4117

    
4118
/* 4x4 reduced-resolution IDCT (lowres==1), then store clamped pixels. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
4123
/* 4x4 reduced-resolution IDCT (lowres==1), then add clamped pixels. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4128

    
4129
/* 2x2 reduced-resolution IDCT (lowres==2), then store clamped pixels. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
4134
/* 2x2 reduced-resolution IDCT (lowres==2), then add clamped pixels. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4139

    
4140
/* 1x1 IDCT (lowres==3): only the DC coefficient matters; descale it by 8
 * with rounding and clamp to [0,255] via the crop table, then store. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
4146
/* 1x1 IDCT (lowres==3): descale the DC coefficient by 8 with rounding,
 * add it to the existing pixel and clamp via the crop table. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4152

    
4153
/* Intentional no-op with a pixel-function signature (placeholder slot). */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4154

    
4155
/* init static data */
4156
av_cold void dsputil_static_init(void)
4157
{
4158
    int i;
4159

    
4160
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4161
    for(i=0;i<MAX_NEG_CROP;i++) {
4162
        ff_cropTbl[i] = 0;
4163
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4164
    }
4165

    
4166
    for(i=0;i<512;i++) {
4167
        ff_squareTbl[i] = (i - 256) * (i - 256);
4168
    }
4169

    
4170
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4171
}
4172

    
4173
int ff_check_alignment(void){
4174
    static int did_fail=0;
4175
    DECLARE_ALIGNED(16, int, aligned);
4176

    
4177
    if((intptr_t)&aligned & 15){
4178
        if(!did_fail){
4179
#if HAVE_MMX || HAVE_ALTIVEC
4180
            av_log(NULL, AV_LOG_ERROR,
4181
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4182
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4183
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4184
                "Do not report crashes to FFmpeg developers.\n");
4185
#endif
4186
            did_fail=1;
4187
        }
4188
        return -1;
4189
    }
4190
    return 0;
4191
}
4192

    
4193
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4194
{
4195
    int i;
4196

    
4197
    ff_check_alignment();
4198

    
4199
#if CONFIG_ENCODERS
4200
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4201
        c->fdct = fdct_ifast;
4202
        c->fdct248 = fdct_ifast248;
4203
    }
4204
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4205
        c->fdct = ff_faandct;
4206
        c->fdct248 = ff_faandct248;
4207
    }
4208
    else {
4209
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4210
        c->fdct248 = ff_fdct248_islow;
4211
    }
4212
#endif //CONFIG_ENCODERS
4213

    
4214
    if(avctx->lowres==1){
4215
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4216
            c->idct_put= ff_jref_idct4_put;
4217
            c->idct_add= ff_jref_idct4_add;
4218
        }else{
4219
            c->idct_put= ff_h264_lowres_idct_put_c;
4220
            c->idct_add= ff_h264_lowres_idct_add_c;
4221
        }
4222
        c->idct    = j_rev_dct4;
4223
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4224
    }else if(avctx->lowres==2){
4225
        c->idct_put= ff_jref_idct2_put;
4226
        c->idct_add= ff_jref_idct2_add;
4227
        c->idct    = j_rev_dct2;
4228
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4229
    }else if(avctx->lowres==3){
4230
        c->idct_put= ff_jref_idct1_put;
4231
        c->idct_add= ff_jref_idct1_add;
4232
        c->idct    = j_rev_dct1;
4233
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4234
    }else{
4235
        if(avctx->idct_algo==FF_IDCT_INT){
4236
            c->idct_put= ff_jref_idct_put;
4237
            c->idct_add= ff_jref_idct_add;
4238
            c->idct    = j_rev_dct;
4239
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4240
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4241
                avctx->idct_algo==FF_IDCT_VP3){
4242
            c->idct_put= ff_vp3_idct_put_c;
4243
            c->idct_add= ff_vp3_idct_add_c;
4244
            c->idct    = ff_vp3_idct_c;
4245
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4246
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4247
            c->idct_put= ff_wmv2_idct_put_c;
4248
            c->idct_add= ff_wmv2_idct_add_c;
4249
            c->idct    = ff_wmv2_idct_c;
4250
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4251
        }else if(avctx->idct_alg