ffmpeg / libavcodec / dsputil.c @ 910b9f30

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f replicated across the CPU's native word size: 0x7f7f7f7f on 32-bit machines, 0x7f7f7f7f7f7f7f7f on 64-bit ones
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
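
/* Illustrative sketch, not part of the original file: ~0UL/255 evaluates to
 * the byte 0x01 replicated across an unsigned long, so multiplying it by a
 * byte value broadcasts that byte into every lane.  On a 32-bit unsigned long:
 *
 *     ~0UL / 255        == 0x01010101
 *     0x01010101 * 0x7f == 0x7f7f7f7f   // pb_7f
 *     0x01010101 * 0x80 == 0x80808080   // pb_80
 */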

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* inverse of the (not permuted) zigzag_direct, plus 1, for the MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, this is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
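
/* Illustrative sketch, not part of the original file: ff_inverse[b] is
 * ceil(2^32 / b), so a division becomes a multiply and a shift.  For
 * example, with b = 3 (table entry 1431655766) and a = 1000:
 *
 *     uint32_t q = (uint32_t)(((uint64_t)1000 * ff_inverse[3]) >> 32);
 *     // q == 333, exactly 1000/3, as guaranteed for all a <= 16909558
 */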

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
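
/* Illustrative sketch, not part of the original file: a decoder loop would
 * typically use the permutated table when storing coefficients, e.g.
 *
 *     for (i = 0; i <= last; i++)
 *         block[st->permutated[i]] = coeff[i];   // coeff[], last: hypothetical
 *
 * st->raster_end[last] is then the highest (permuted) raster index touched,
 * which lets the IDCT skip rows and columns that are guaranteed zero. */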

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
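
/* Illustrative note, not part of the original file: sq points 256 entries
 * into ff_squareTbl, so a signed difference d = pix1[i] - pix2[i] in the
 * range [-255, 255] indexes the table directly and sq[d] == d*d.  This
 * assumes the table was initialized along the lines of:
 *
 *     for (i = 0; i < 512; i++)
 *         ff_squareTbl[i] = (i - 256) * (i - 256);
 */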

/* draw the edges of width 'w' of an image of size width x height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
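
/* Illustrative sketch, not part of the original file: a motion-compensation
 * caller typically points src at the (possibly out-of-picture) top-left
 * sample of the block and lets this function build a padded copy.  E.g. for
 * an 8x8 block at (-3, -2) in a w x h picture:
 *
 *     ff_emulated_edge_mc(edge_buf, ref + (-2)*linesize + (-3), linesize,
 *                         8, 8, -3, -2, w, h);   // edge_buf, ref: hypothetical
 *
 * afterwards edge_buf holds the block with out-of-picture samples replaced
 * by replicated border pixels, and MC reads from edge_buf instead of ref. */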

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block to the pixels, clamping to 0..255 */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block to the pixels, clamping to 0..255 */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block to the pixels, clamping to 0..255 */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* copy the block to the pixels without clamping */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping to 0..255 */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping to 0..255 */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping to 0..255 */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
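
/* Illustrative note, not part of the original file: src[i] * 0x0101 puts the
 * same byte in both halves of the uint16_t (regardless of endianness), and
 * the value is stored to dst1 and dst2, which sit one line apart and advance
 * two lines per iteration.  Each source pixel therefore becomes a 2x2 block:
 * the 8x8 source is scaled up to 16x16. */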

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the disabled block above is the 64 bit one)

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
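
/* Illustrative sketch, not part of the original file: rnd_avg32() and
 * no_rnd_avg32() average four packed bytes at once using the identities
 *
 *     (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1)   // rounds up
 *     (a + b)     >> 1 == (a & b) + ((a ^ b) >> 1)   // rounds down
 *
 * In the packed form the xor is masked with 0xFEFEFEFE before the shift so
 * that no bit leaks from one byte lane into the next.  For example, with
 * a = 0x01FF0203 and b = 0x03010205:
 *
 *     a|b = 0x03FF0207, (a^b)&0xFEFEFEFE = 0x02FE0006, >>1 = 0x017F0003
 *     rounded average  = 0x03FF0207 - 0x017F0003 = 0x02800204
 *
 * matching the per-byte results (1+3+1)>>1, (255+1+1)>>1, (2+2+1)>>1 and
 * (3+5+1)>>1. */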

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
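
/* Illustrative note, not part of the original file: A, B, C and D above are
 * the bilinear weights for the sub-pel offset (x16/16, y16/16); they always
 * sum to 16*16 = 256, which the >>8 normalizes away.  E.g. x16 = 4, y16 = 8
 * gives A = 96, B = 32, C = 96, D = 32, i.e. weights of 3/8, 1/8, 3/8 and
 * 1/8 on the four neighbouring samples (plus the rounder). */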

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
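
/* Illustrative note, not part of the original file: 683/2048 is a fixed-point
 * approximation of 1/3 (683*3 = 2049), so (683*(2*a + b + 1)) >> 11 computes
 * the 2/3 : 1/3 weighting needed for third-pel interpolation without a
 * division.  The two-dimensional taps below use 2731/32768 ~= 1/12
 * (2731*12 = 32772) the same way.  Full-scale input stays in range:
 * a = b = 255 gives (683*766) >> 11 = 523178 >> 11 = 255. */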
1326

    
1327
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1328
    int i,j;
1329
    for (i=0; i < height; i++) {
1330
      for (j=0; j < width; j++) {
1331
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1332
      }
1333
      src += stride;
1334
      dst += stride;
1335
    }
1336
}
1337

    
1338
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1339
    int i,j;
1340
    for (i=0; i < height; i++) {
1341
      for (j=0; j < width; j++) {
1342
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1343
      }
1344
      src += stride;
1345
      dst += stride;
1346
    }
1347
}
1348

    
1349
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1350
    int i,j;
1351
    for (i=0; i < height; i++) {
1352
      for (j=0; j < width; j++) {
1353
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1354
      }
1355
      src += stride;
1356
      dst += stride;
1357
    }
1358
}
1359

    
1360
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1361
    int i,j;
1362
    for (i=0; i < height; i++) {
1363
      for (j=0; j < width; j++) {
1364
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1365
      }
1366
      src += stride;
1367
      dst += stride;
1368
    }
1369
}
1370

    
1371
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1372
    int i,j;
1373
    for (i=0; i < height; i++) {
1374
      for (j=0; j < width; j++) {
1375
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1376
      }
1377
      src += stride;
1378
      dst += stride;
1379
    }
1380
}
1381

    
1382
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1383
    int i,j;
1384
    for (i=0; i < height; i++) {
1385
      for (j=0; j < width; j++) {
1386
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1387
      }
1388
      src += stride;
1389
      dst += stride;
1390
    }
1391
}
1392

    
1393
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1394
    int i,j;
1395
    for (i=0; i < height; i++) {
1396
      for (j=0; j < width; j++) {
1397
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1398
      }
1399
      src += stride;
1400
      dst += stride;
1401
    }
1402
}
1403

    
1404
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

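/* H.264 chroma MC: bilinear interpolation at eighth-pel precision.  The
 * four taps A..D are (8-x)(8-y), x(8-y), (8-x)y and xy and always sum to
 * 64.  When x or y is 0, D is 0 and the 2-D filter degenerates to a 1-D
 * one, so only two taps (A and E = B+C) along the remaining axis are
 * needed. */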
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

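/* Since the taps sum to 64, op_put rounds with +32 and shifts by 6;
 * op_avg additionally averages, with rounding, against the value already
 * in dst. */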
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

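/* VC-1 uses the same bilinear kernel with a "no rounding" bias: the
 * rounding constant is 32 - 4 = 28 instead of 32, biasing the result
 * slightly downwards. */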
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

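/* MPEG-4 qpel: half-pel intermediates are produced by an 8-tap lowpass
 * filter with taps (-1, 3, -6, 20, 20, -6, 3, -1)/32; taps that would
 * fall outside the block are mirrored back across the block edge (hence
 * the asymmetric terms near dst[5..7]).  The mcXY suffixes encode the
 * quarter-pel phase, X horizontal and Y vertical. */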
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

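/* The filter taps sum to 32 = 2*(20-6+3-1), so the result is rounded
 * with +16 and shifted right by 5; the no_rnd variants add only 15,
 * which biases the rounding downwards. */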
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#if 1
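/* H.264 luma MC: half-pel samples come from the standard 6-tap filter
 * (1, -5, 20, 20, -5, 1).  The hv variants filter horizontally into a
 * full-precision int16_t temporary (rows -2..h+2) first and then filter
 * that vertically, avoiding an intermediate rounding step. */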
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

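/* H264_MC expands to the 16 OPNAME ## h264_qpel ## SIZE ## _mcXY
 * functions, X and Y being the quarter-pel phases.  Quarter-pel
 * positions are formed by averaging the two nearest full/half-pel
 * planes, which is what the _l2 helpers and the full/halfH/halfV/halfHV
 * temporaries below are for. */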
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

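/* A single filter pass scales by 32 (hence +16 >> 5); the two-pass hv
 * path scales by 32*32 = 1024, hence +512 >> 10 in the op2 variants. */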
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

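/* Rounding in the ops above: the 6-tap half-pel kernel used by the
 * H264_LOWPASS filters sums to 32, so the 1-D results are normalized with
 * (b + 16) >> 5, while the 2-D (hv) results, filtered twice for a gain of
 * 32*32 = 1024, use (b + 512) >> 10 in the op2 variants. */
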
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

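/* The WMV2 half-pel filter above is the 4-tap kernel (-1, 9, 9, -1)/16:
 * for flat input src[-1..2] = {100,100,100,100} it yields
 * (9*200 - 200 + 8) >> 4 = 1608 >> 4 = 100, i.e. unity gain, with the
 * +8 term rounding to nearest before the shift. */
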
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

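/* In both loop filters above, "if(p&256) p= ~(p>>31);" is a branchless-style
 * clamp to 0..255: after the +-d1 update, bit 8 is only set if p left that
 * range, and ~(p>>31) evaluates to 0 for negative p (p>>31 == -1) and to
 * -1, i.e. 255 after the uint8_t store, for p > 255.
 * E.g. p = -3 -> ~(-1) = 0; p = 260 -> ~0 = -1 -> stored as 255. */
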
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

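/* h261_loop_filter_c is a separable (1,2,1)/4 smoother: the first pass runs
 * the kernel vertically into temp[] (border rows are stored as 4*src, i.e.
 * unfiltered at the same scale), the second pass runs it horizontally and
 * normalizes with +8 >> 4; border columns only undo the 4x scale with
 * +2 >> 2. */
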
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

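/* The _x2, _y2 and _xy2 variants above compute the same SAD against the
 * horizontally, vertically and diagonally half-pel interpolated reference
 * (avg2/avg4 are the rounding pixel averages). A generic, non-unrolled
 * form of the plain SAD would look like this sketch: */
#if 0
static int pix_abs_c(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int s = 0, i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            s += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#endif
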
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

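/* NSSE = SSE plus a penalty for differences in local texture: score2
 * accumulates how much the 2x2 second-order gradients of s1 and s2 differ,
 * so a candidate that smooths away (or invents) noise costs extra,
 * weighted by avctx->nsse_weight (8 when no context is available). */
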
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

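/* ff_set_cmp() maps the low byte of a FF_CMP_* id onto the six size
 * variants of the matching comparison table, e.g. FF_CMP_SATD selects
 * c->hadamard8_diff[] (type&0xFF strips the chroma flag kept in the high
 * bits). Typical callers are the encoder's motion estimation setup
 * routines filling their me_cmp/mb_cmp function arrays. */
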
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

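/* The long-word loops above add bytes in parallel (SWAR): the low 7 bits of
 * each byte are summed without inter-byte carries, and the MSBs are patched
 * back in with an XOR. Per byte this computes
 *   ((a&0x7f)+(b&0x7f)) ^ ((a^b)&0x80) == (a+b) & 0xff,
 * e.g. a=0xFF, b=0x01: (0x7f+0x01) ^ (0xFE&0x80) = 0x80 ^ 0x80 = 0x00. */
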
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

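/* mid_pred(l, top, l + top - lt) above is the LOCO-I/median predictor used
 * by HuffYUV: predict from left (l), top (src1[i]) and the gradient
 * l + top - topleft, clamped to the median of the three.
 * Example: l=10, top=20, lt=25 -> gradient 5 -> median(10, 20, 5) = 10. */
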
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

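/* BUTTERFLY2/BUTTERFLY1 are 2-point Hadamard butterflies (sum/difference);
 * the hadamard8 functions below apply three butterfly stages per row and
 * three per column, i.e. an 8x8 Hadamard transform of the (difference)
 * block, and return the sum of absolute transform coefficients (SATD).
 * BUTTERFLYA folds the final stage and the |.| accumulation into one step. */
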
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

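/* dct264_sad8x8_c applies the 1-D DCT8_1D transform twice through the
 * SRC/DST macros: the first pass runs over rows and stores back into
 * dct[][], the second runs over columns with DST accumulating |coeff|
 * directly, so no transposed intermediate is needed. */
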
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

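/* rd8x8_c returns a rate-distortion score: the SSE between the source and
 * the quantize/dequantize/IDCT reconstruction, plus the estimated VLC bit
 * cost weighted by qscale*qscale*109/128 (the +64 before >>7 rounds). */
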
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

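/* WRAPPER8_16_SQ (defined earlier in this file) expands each pair above
 * into a 16-wide comparison that sums the 8x8 function over the 8x8
 * quadrants of the block (two for h==8, four for h==16). */
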
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

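/* ff_vector_fmul_window_c writes both halves of a windowed overlap-add in
 * one pass over dst[-len..len-1]: with j = -i-1,
 *   dst[i] = s0*wj - s1*wi + add_bias
 *   dst[j] = s0*wi + s1*wj + add_bias
 * as used for MDCT overlap in the audio decoders; add_bias lets the output
 * be pre-offset for the bit-twiddling float_to_int16 path further down. */
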
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}

static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}

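/* float_to_int16_one() relies on the samples being pre-biased (see the
 * add_bias parameter above) so that the valid range maps onto floats in
 * [384.0, 386.0): there one 16-bit step equals one ulp, and the low 16
 * bits of the IEEE-754 representation are the sample plus 0x8000 (385.0
 * has the bit pattern 0x43c08000). Values just outside that window set
 * one of bits 16..19, trip the "tmp & 0xf0000" test and are clamped to
 * -32768 or 32767 depending on the sign of 0x43c0ffff - tmp. */
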
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

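/* Worked check of the constants above: 2048*sqrt(2) = 2896.31, so
 * W1 = round(2896.31 * cos(pi/16)) = round(2896.31 * 0.98079) = 2841 and
 * W4 = round(2896.31 * cos(4*pi/16)) = round(2896.31 * 0.70711) = 2048,
 * matching W0; the +128>>8 and +(1<<13)>>14 terms in the IDCT below undo
 * this fixed-point scaling with rounding. */
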
static void wmv2_idct_row(short * b)
4020
{
4021
    int s1,s2;
4022
    int a0,a1,a2,a3,a4,a5,a6,a7;
4023
    /*step 1*/
4024
    a1 = W1*b[1]+W7*b[7];
4025
    a7 = W7*b[1]-W1*b[7];
4026
    a5 = W5*b[5]+W3*b[3];
4027
    a3 = W3*b[5]-W5*b[3];
4028
    a2 = W2*b[2]+W6*b[6];
4029
    a6 = W6*b[2]-W2*b[6];
4030
    a0 = W0*b[0]+W0*b[4];
4031
    a4 = W0*b[0]-W0*b[4];
4032
    /*step 2*/
4033
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4034
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4035
    /*step 3*/
4036
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4037
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
4038
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
4039
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4040
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4041
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
4042
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
4043
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4044
}
4045
static void wmv2_idct_col(short * b)
4046
{
4047
    int s1,s2;
4048
    int a0,a1,a2,a3,a4,a5,a6,a7;
4049
    /*step 1, with extended precision*/
4050
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4051
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4052
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4053
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4054
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4055
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4056
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4057
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4058
    /*step 2*/
4059
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
4060
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
4061
    /*step 3*/
4062
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4063
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4064
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4065
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4066

    
4067
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4068
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4069
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4070
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4071
}
4072
void ff_wmv2_idct_c(short * block){
4073
    int i;
4074

    
4075
    for(i=0;i<64;i+=8){
4076
        wmv2_idct_row(block+i);
4077
    }
4078
    for(i=0;i<8;i++){
4079
        wmv2_idct_col(block+i);
4080
    }
4081
}
4082
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4083
 converted */
4084
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4085
{
4086
    ff_wmv2_idct_c(block);
4087
    put_pixels_clamped_c(block, dest, line_size);
4088
}
4089
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4090
{
4091
    ff_wmv2_idct_c(block);
4092
    add_pixels_clamped_c(block, dest, line_size);
4093
}
4094
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4095
{
4096
    j_rev_dct (block);
4097
    put_pixels_clamped_c(block, dest, line_size);
4098
}
4099
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4100
{
4101
    j_rev_dct (block);
4102
    add_pixels_clamped_c(block, dest, line_size);
4103
}
4104

    
4105
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4106
{
4107
    j_rev_dct4 (block);
4108
    put_pixels_clamped4_c(block, dest, line_size);
4109
}
4110
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4111
{
4112
    j_rev_dct4 (block);
4113
    add_pixels_clamped4_c(block, dest, line_size);
4114
}
4115

    
4116
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4117
{
4118
    j_rev_dct2 (block);
4119
    put_pixels_clamped2_c(block, dest, line_size);
4120
}
4121
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4122
{
4123
    j_rev_dct2 (block);
4124
    add_pixels_clamped2_c(block, dest, line_size);
4125
}
4126

    
4127
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4128
{
4129
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4130

    
4131
    dest[0] = cm[(block[0] + 4)>>3];
4132
}
4133
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4134
{
4135
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4136

    
4137
    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4138
}
4139

    
4140
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4141

    
4142
/* init static data */
4143
av_cold void dsputil_static_init(void)
4144
{
4145
    int i;
4146

    
4147
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4148
    for(i=0;i<MAX_NEG_CROP;i++) {
4149
        ff_cropTbl[i] = 0;
4150
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4151
    }
4152

    
4153
    for(i=0;i<512;i++) {
4154
        ff_squareTbl[i] = (i - 256) * (i - 256);
4155
    }
4156

    
4157
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4158
}
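/* Illustrative sketch (not part of the original file): once
 * dsputil_static_init() has run, ff_cropTbl clamps any value in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to [0,255] with a single branch-free
 * table lookup, which is the idiom the idct1 functions above rely on:
 *
 *     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 *     uint8_t pix = cm[x];   // 0 if x < 0, 255 if x > 255, otherwise x
 *
 * Similarly, indexing ff_squareTbl + 256 with a difference d in [-256, 255]
 * yields d*d without a multiply at the call site.
 */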
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
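/* Illustrative sketch (hypothetical caller, not part of the original file):
 * a codec normally embeds a DSPContext, fills it once via dsputil_init(),
 * and afterwards dispatches only through the function pointers selected
 * below, e.g.
 *
 *     DSPContext dsp;
 *     dsputil_init(&dsp, avctx);
 *     dsp.idct_put(dest, line_size, block);  // whichever IDCT was chosen
 */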
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
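    /* Illustrative (hypothetical user code, not in the original file): the
     * forward-DCT choice above is driven by the public option on the
     * AVCodecContext, set before the encoder is opened, e.g.
     *
     *     avctx->dct_algo = FF_DCT_FAAN;  // request the floating-point FAAN FDCT
     *
     * Anything other than FF_DCT_FASTINT or FF_DCT_FAAN falls through to the
     * accurate integer default.
     */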
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;