Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ d6f8476b

History | View | Annotate | Download (161 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "avcodec.h"
31
#include "dsputil.h"
32
#include "simple_idct.h"
33
#include "faandct.h"
34
#include "faanidct.h"
35
#include "mathops.h"
36
#include "mpegvideo.h"
37
#include "config.h"
38
#include "lpc.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42
#include "vp8dsp.h"
43

    
44
/* Clipping table: indexed with a bias of MAX_NEG_CROP so out-of-range DCT
   results map to 0/255. Zeroed here; presumably filled by an init routine
   not visible in this chunk — TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table, used with a +256 bias so negative differences can be
   squared by lookup. Zeroed here; filled at init time (not in this chunk). */
uint32_t ff_squareTbl[512] = {0, };
46

    
47
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 yields the all-bytes-0x01 pattern for the native word width)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
50

    
51
/* Classic zigzag scan: maps scan position to raster index within an
   8x8 block (low-frequency coefficients first). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
61

    
62
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
74

    
75
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* storage only — no initializer here, so it is filled at runtime
   (the init code is not in this chunk). */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
77

    
78
/* Alternate (horizontally-biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
88

    
89
/* Alternate (vertically-biased) scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
99

    
100
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24
 * (reciprocal table: entry b is ceil(2^32 / b); enables division by
 * multiply + shift) */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
137

    
138
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
149

    
150
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
151

    
152
/**
 * Initialize a ScanTable from a scan order and a CPU-specific permutation.
 * permutated[i] holds the permuted raster index of scan position i;
 * raster_end[i] is the highest permuted index seen up to position i
 * (running maximum, used to know how far a partial scan reaches).
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, max;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        int raster = src_scantable[i];
        st->permutated[i] = permutation[raster];
#if ARCH_PPC
        /* PPC also wants the inverse mapping: raster index -> scan position */
        st->inverse[raster] = i;
#endif
    }

    max = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > max)
            max = st->permutated[i];
        st->raster_end[i] = max;
    }
}
175

    
176
/**
 * Sum all 256 samples of a 16x16 pixel block.
 * @param pix       top-left sample of the block
 * @param line_size byte stride between rows
 * @return sum of the samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
197

    
198
static int pix_norm1_c(uint8_t * pix, int line_size)
199
{
200
    int s, i, j;
201
    uint32_t *sq = ff_squareTbl + 256;
202

    
203
    s = 0;
204
    for (i = 0; i < 16; i++) {
205
        for (j = 0; j < 16; j += 8) {
206
#if 0
207
            s += sq[pix[0]];
208
            s += sq[pix[1]];
209
            s += sq[pix[2]];
210
            s += sq[pix[3]];
211
            s += sq[pix[4]];
212
            s += sq[pix[5]];
213
            s += sq[pix[6]];
214
            s += sq[pix[7]];
215
#else
216
#if LONG_MAX > 2147483647
217
            register uint64_t x=*(uint64_t*)pix;
218
            s += sq[x&0xff];
219
            s += sq[(x>>8)&0xff];
220
            s += sq[(x>>16)&0xff];
221
            s += sq[(x>>24)&0xff];
222
            s += sq[(x>>32)&0xff];
223
            s += sq[(x>>40)&0xff];
224
            s += sq[(x>>48)&0xff];
225
            s += sq[(x>>56)&0xff];
226
#else
227
            register uint32_t x=*(uint32_t*)pix;
228
            s += sq[x&0xff];
229
            s += sq[(x>>8)&0xff];
230
            s += sq[(x>>16)&0xff];
231
            s += sq[(x>>24)&0xff];
232
            x=*(uint32_t*)(pix+4);
233
            s += sq[x&0xff];
234
            s += sq[(x>>8)&0xff];
235
            s += sq[(x>>16)&0xff];
236
            s += sq[(x>>24)&0xff];
237
#endif
238
#endif
239
            pix += 8;
240
        }
241
        pix += line_size - 16;
242
    }
243
    return s;
244
}
245

    
246
/**
 * Byte-swap w 32-bit words from src into dst.
 * Element-wise, so dst may alias src (in-place swap).
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
263

    
264
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
265
{
266
    int s, i;
267
    uint32_t *sq = ff_squareTbl + 256;
268

    
269
    s = 0;
270
    for (i = 0; i < h; i++) {
271
        s += sq[pix1[0] - pix2[0]];
272
        s += sq[pix1[1] - pix2[1]];
273
        s += sq[pix1[2] - pix2[2]];
274
        s += sq[pix1[3] - pix2[3]];
275
        pix1 += line_size;
276
        pix2 += line_size;
277
    }
278
    return s;
279
}
280

    
281
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
282
{
283
    int s, i;
284
    uint32_t *sq = ff_squareTbl + 256;
285

    
286
    s = 0;
287
    for (i = 0; i < h; i++) {
288
        s += sq[pix1[0] - pix2[0]];
289
        s += sq[pix1[1] - pix2[1]];
290
        s += sq[pix1[2] - pix2[2]];
291
        s += sq[pix1[3] - pix2[3]];
292
        s += sq[pix1[4] - pix2[4]];
293
        s += sq[pix1[5] - pix2[5]];
294
        s += sq[pix1[6] - pix2[6]];
295
        s += sq[pix1[7] - pix2[7]];
296
        pix1 += line_size;
297
        pix2 += line_size;
298
    }
299
    return s;
300
}
301

    
302
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
303
{
304
    int s, i;
305
    uint32_t *sq = ff_squareTbl + 256;
306

    
307
    s = 0;
308
    for (i = 0; i < h; i++) {
309
        s += sq[pix1[ 0] - pix2[ 0]];
310
        s += sq[pix1[ 1] - pix2[ 1]];
311
        s += sq[pix1[ 2] - pix2[ 2]];
312
        s += sq[pix1[ 3] - pix2[ 3]];
313
        s += sq[pix1[ 4] - pix2[ 4]];
314
        s += sq[pix1[ 5] - pix2[ 5]];
315
        s += sq[pix1[ 6] - pix2[ 6]];
316
        s += sq[pix1[ 7] - pix2[ 7]];
317
        s += sq[pix1[ 8] - pix2[ 8]];
318
        s += sq[pix1[ 9] - pix2[ 9]];
319
        s += sq[pix1[10] - pix2[10]];
320
        s += sq[pix1[11] - pix2[11]];
321
        s += sq[pix1[12] - pix2[12]];
322
        s += sq[pix1[13] - pix2[13]];
323
        s += sq[pix1[14] - pix2[14]];
324
        s += sq[pix1[15] - pix2[15]];
325

    
326
        pix1 += line_size;
327
        pix2 += line_size;
328
    }
329
    return s;
330
}
331

    
332
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *top = buf;
    uint8_t *bottom = buf + (height - 1) * wrap;
    int i;

    /* replicate the first and last rows into the w rows above/below */
    for (i = 1; i <= w; i++) {
        memcpy(top - i * wrap, top, width);
        memcpy(bottom + i * wrap, bottom, width);
    }

    /* replicate the first/last column of every row to the left/right */
    for (i = 0; i < height; i++) {
        uint8_t *row = buf + i * wrap;
        memset(row - w, row[0], w);
        memset(row + width, row[width - 1], w);
    }

    /* fill the four corner regions from the corner samples */
    for (i = 1; i <= w; i++) {
        memset(top - i * wrap - w,        top[0],             w); /* top left */
        memset(top - i * wrap + width,    top[width - 1],     w); /* top right */
        memset(bottom + i * wrap - w,     bottom[0],          w); /* bottom left */
        memset(bottom + i * wrap + width, bottom[width - 1],  w); /* bottom right */
    }
}
360

    
361
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp src_y/src_x so the block overlaps the picture by at least one
       row/column; src is adjusted so src[x + y*linesize] still addresses
       the same sample after the clamp. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* portion of the block that lies inside the source picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first copied row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last copied row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* left/right run over the full block height, so they also extend the
       rows that were just filled by the top/bottom passes */
    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
431

    
432
/* Widen an 8x8 block of unsigned pixels into a DCT coefficient block. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
450

    
451
/* Store the element-wise difference of two 8x8 pixel blocks into a
   DCT coefficient block. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
470

    
471

    
472
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
473
                                 int line_size)
474
{
475
    int i;
476
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
477

    
478
    /* read the pixels */
479
    for(i=0;i<8;i++) {
480
        pixels[0] = cm[block[0]];
481
        pixels[1] = cm[block[1]];
482
        pixels[2] = cm[block[2]];
483
        pixels[3] = cm[block[3]];
484
        pixels[4] = cm[block[4]];
485
        pixels[5] = cm[block[5]];
486
        pixels[6] = cm[block[6]];
487
        pixels[7] = cm[block[7]];
488

    
489
        pixels += line_size;
490
        block += 8;
491
    }
492
}
493

    
494
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
495
                                 int line_size)
496
{
497
    int i;
498
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
499

    
500
    /* read the pixels */
501
    for(i=0;i<4;i++) {
502
        pixels[0] = cm[block[0]];
503
        pixels[1] = cm[block[1]];
504
        pixels[2] = cm[block[2]];
505
        pixels[3] = cm[block[3]];
506

    
507
        pixels += line_size;
508
        block += 8;
509
    }
510
}
511

    
512
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
513
                                 int line_size)
514
{
515
    int i;
516
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
517

    
518
    /* read the pixels */
519
    for(i=0;i<2;i++) {
520
        pixels[0] = cm[block[0]];
521
        pixels[1] = cm[block[1]];
522

    
523
        pixels += line_size;
524
        block += 8;
525
    }
526
}
527

    
528
/* Store an 8x8 block of signed coefficients as pixels: each value is
   offset by +128 and clamped to [0,255]. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col];
            pixels[col] = v < -128 ? 0 : v > 127 ? 255 : (uint8_t)(v + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
548

    
549
/* Store an 8x8 coefficient block as pixels without clamping
   (values are truncated to 8 bits by the assignment). */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
569

    
570
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
571
                          int line_size)
572
{
573
    int i;
574
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
575

    
576
    /* read the pixels */
577
    for(i=0;i<8;i++) {
578
        pixels[0] = cm[pixels[0] + block[0]];
579
        pixels[1] = cm[pixels[1] + block[1]];
580
        pixels[2] = cm[pixels[2] + block[2]];
581
        pixels[3] = cm[pixels[3] + block[3]];
582
        pixels[4] = cm[pixels[4] + block[4]];
583
        pixels[5] = cm[pixels[5] + block[5]];
584
        pixels[6] = cm[pixels[6] + block[6]];
585
        pixels[7] = cm[pixels[7] + block[7]];
586
        pixels += line_size;
587
        block += 8;
588
    }
589
}
590

    
591
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
592
                          int line_size)
593
{
594
    int i;
595
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
596

    
597
    /* read the pixels */
598
    for(i=0;i<4;i++) {
599
        pixels[0] = cm[pixels[0] + block[0]];
600
        pixels[1] = cm[pixels[1] + block[1]];
601
        pixels[2] = cm[pixels[2] + block[2]];
602
        pixels[3] = cm[pixels[3] + block[3]];
603
        pixels += line_size;
604
        block += 8;
605
    }
606
}
607

    
608
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
609
                          int line_size)
610
{
611
    int i;
612
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
613

    
614
    /* read the pixels */
615
    for(i=0;i<2;i++) {
616
        pixels[0] = cm[pixels[0] + block[0]];
617
        pixels[1] = cm[pixels[1] + block[1]];
618
        pixels += line_size;
619
        block += 8;
620
    }
621
}
622

    
623
/* Add an 8x8 coefficient block to pixels without clamping
   (sums wrap modulo 256 via the uint8_t assignment). */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
639

    
640
/* 4x4 variant of add_pixels8_c. NOTE: unlike the clamped 4x4 helpers,
   block advances by 4 per row here (packed 4-wide coefficient layout). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
652

    
653
/* Sum of absolute values of all 64 coefficients of a block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *p = block, *end = block + 64;
    int total = 0;

    while (p < end)
        total += FFABS(*p++);
    return total;
}
660

    
661
/* Fill the first 16 bytes of each of h rows with a constant value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
670

    
671
/* Fill the first 8 bytes of each of h rows with a constant value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
680

    
681
/* 2x pixel-replication upscale of an 8x8 block into a 16x16 area:
   each source sample is duplicated into a 2x2 square (0x0101 spreads
   the byte into both halves of a 16-bit store). */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        uint16_t *line0 = (uint16_t *) (dst +  2 * row      * linesize);
        uint16_t *line1 = (uint16_t *) (dst + (2 * row + 1) * linesize);
        for (col = 0; col < 8; col++)
            line0[col] = line1[col] = src[row * 8 + col] * 0x0101;
    }
}
696

    
697
#if 0
698

699
#define PIXOP2(OPNAME, OP) \
700
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
701
{\
702
    int i;\
703
    for(i=0; i<h; i++){\
704
        OP(*((uint64_t*)block), AV_RN64(pixels));\
705
        pixels+=line_size;\
706
        block +=line_size;\
707
    }\
708
}\
709
\
710
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
711
{\
712
    int i;\
713
    for(i=0; i<h; i++){\
714
        const uint64_t a= AV_RN64(pixels  );\
715
        const uint64_t b= AV_RN64(pixels+1);\
716
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
717
        pixels+=line_size;\
718
        block +=line_size;\
719
    }\
720
}\
721
\
722
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
723
{\
724
    int i;\
725
    for(i=0; i<h; i++){\
726
        const uint64_t a= AV_RN64(pixels  );\
727
        const uint64_t b= AV_RN64(pixels+1);\
728
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
729
        pixels+=line_size;\
730
        block +=line_size;\
731
    }\
732
}\
733
\
734
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
735
{\
736
    int i;\
737
    for(i=0; i<h; i++){\
738
        const uint64_t a= AV_RN64(pixels          );\
739
        const uint64_t b= AV_RN64(pixels+line_size);\
740
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
741
        pixels+=line_size;\
742
        block +=line_size;\
743
    }\
744
}\
745
\
746
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
747
{\
748
    int i;\
749
    for(i=0; i<h; i++){\
750
        const uint64_t a= AV_RN64(pixels          );\
751
        const uint64_t b= AV_RN64(pixels+line_size);\
752
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
753
        pixels+=line_size;\
754
        block +=line_size;\
755
    }\
756
}\
757
\
758
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
759
{\
760
        int i;\
761
        const uint64_t a= AV_RN64(pixels  );\
762
        const uint64_t b= AV_RN64(pixels+1);\
763
        uint64_t l0=  (a&0x0303030303030303ULL)\
764
                    + (b&0x0303030303030303ULL)\
765
                    + 0x0202020202020202ULL;\
766
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
767
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
768
        uint64_t l1,h1;\
769
\
770
        pixels+=line_size;\
771
        for(i=0; i<h; i+=2){\
772
            uint64_t a= AV_RN64(pixels  );\
773
            uint64_t b= AV_RN64(pixels+1);\
774
            l1=  (a&0x0303030303030303ULL)\
775
               + (b&0x0303030303030303ULL);\
776
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
777
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
778
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
779
            pixels+=line_size;\
780
            block +=line_size;\
781
            a= AV_RN64(pixels  );\
782
            b= AV_RN64(pixels+1);\
783
            l0=  (a&0x0303030303030303ULL)\
784
               + (b&0x0303030303030303ULL)\
785
               + 0x0202020202020202ULL;\
786
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
787
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
788
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
789
            pixels+=line_size;\
790
            block +=line_size;\
791
        }\
792
}\
793
\
794
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
795
{\
796
        int i;\
797
        const uint64_t a= AV_RN64(pixels  );\
798
        const uint64_t b= AV_RN64(pixels+1);\
799
        uint64_t l0=  (a&0x0303030303030303ULL)\
800
                    + (b&0x0303030303030303ULL)\
801
                    + 0x0101010101010101ULL;\
802
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
803
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
804
        uint64_t l1,h1;\
805
\
806
        pixels+=line_size;\
807
        for(i=0; i<h; i+=2){\
808
            uint64_t a= AV_RN64(pixels  );\
809
            uint64_t b= AV_RN64(pixels+1);\
810
            l1=  (a&0x0303030303030303ULL)\
811
               + (b&0x0303030303030303ULL);\
812
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
813
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
814
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
815
            pixels+=line_size;\
816
            block +=line_size;\
817
            a= AV_RN64(pixels  );\
818
            b= AV_RN64(pixels+1);\
819
            l0=  (a&0x0303030303030303ULL)\
820
               + (b&0x0303030303030303ULL)\
821
               + 0x0101010101010101ULL;\
822
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
823
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
824
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
825
            pixels+=line_size;\
826
            block +=line_size;\
827
        }\
828
}\
829
\
830
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
831
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
832
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
833
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
834
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
835
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
836
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
837

838
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
839
#else // 64 bit variant
840

    
841
/**
 * PIXOP2(OPNAME, OP): 32-bit generator for the pixel copy/average primitive
 * family (widths 2/4/8/16, plain, x2/y2/xy2 half-pel interpolation, l2/l4
 * multi-source averaging, rounding and no-rounding variants).
 * OP(dst, val) is either a plain store (put) or an average-with-dst (avg).
 * Packed byte averaging splits each 32-bit word into low 2 bits (l0/l1)
 * and high 6 bits (h0/h1) so four bytes are averaged without carry leaks;
 * the rounding variant adds 0x02020202, the no_rnd variant 0x01010101.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

/* 32-bit op_avg: packed rounded byte average; op_put: plain store. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

1211
PIXOP2(avg, op_avg)
1212
PIXOP2(put, op_put)
1213
#undef op_avg
1214
#undef op_put
1215

    
/* Scalar rounded averages of 2 and 4 samples (ties round up).
 * Arguments are parenthesized and may be evaluated once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)

/* Average two 16-wide sources into dst without rounding (common-stride wrapper). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

    
/* Average two 8-wide sources into dst without rounding (common-stride wrapper). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

    
/**
 * 1-warp-point global motion compensation for an 8-wide block:
 * bilinear interpolation with 1/16-pel weights (x16, y16 in 0..16),
 * rounded by 'rounder' and normalized by >>8 (weights sum to 256).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

    
/**
 * Global motion compensation with an affine transform over an 8-wide strip.
 * (ox,oy) is the 16.16 fixed-point start position; (dxx,dxy,dyx,dyy) are the
 * per-column / per-row position increments; 'shift' gives sub-pel precision
 * s = 1<<shift, 'r' is the rounding constant; width/height bound the source.
 * Samples outside the source are clamped to the nearest edge (av_clip) and
 * degrade to 1-D or nearest-neighbour interpolation.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* clamped vertically: horizontal interpolation only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clamped horizontally: vertical interpolation only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped both ways: nearest edge sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

    
/* Thirdpel MC, full-pel position: dispatch a plain copy by block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

    
/* Thirdpel MC at (1/3, 0): dst = (2*a + b)/3 via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 0): dst = (a + 2*b)/3 via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (0, 1/3): vertical dst = (2*top + bottom)/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (1/3, 1/3): 2-D weighted average, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (1/3, 2/3): 2-D weighted average, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (0, 2/3): vertical dst = (top + 2*bottom)/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 1/3): 2-D weighted average, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 2/3): 2-D weighted average, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC, full-pel position, averaging into dst: dispatch by width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

    
/* Thirdpel MC at (1/3, 0), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 0), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (0, 1/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (1/3, 1/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (1/3, 2/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (0, 2/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 1/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

    
/* Thirdpel MC at (2/3, 2/3), rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
/* Disabled width-specialized thirdpel wrappers (kept for reference). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

    
/**
 * H264_CHROMA_MC(OPNAME, OP): generator for 2/4/8-wide H.264 chroma MC.
 * Bilinear weights A..D are built from eighth-pel offsets x,y in 0..7 and
 * sum to 64; OP applies the final rounding/shift (and averaging for avg).
 * When D == 0 the 2-D filter degenerates to a 1-D filter along either axis
 * (step = stride when only y is fractional, 1 when only x is).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

    
1624
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1625
#define op_put(a, b) a = (((b) + 32)>>6)
1626

    
1627
H264_CHROMA_MC(put_       , op_put)
1628
H264_CHROMA_MC(avg_       , op_avg)
1629
#undef op_avg
1630
#undef op_put
/**
 * VC-1 8-pixel-wide chroma motion compensation, "no rounding" variant
 * (C reference implementation, "put" flavour: overwrites dst).
 *
 * Bilinear interpolation at 1/8-pel precision: every output pixel is a
 * weighted sum of the four surrounding source pixels with weights
 * A..D derived from the fractional position (x, y), 0 <= x,y < 8.
 * The A+B+C+D weights always sum to 64, and the bias is 32-4 = 28
 * (VC-1's biased-down rounding) before the >>6 normalization.
 *
 * @param dst    destination block (8 bytes wide, h rows), align 8
 * @param src    source pixels, align 1; rows 0..h and columns 0..8 are read
 * @param stride line size of both dst and src in bytes
 * @param h      number of rows to process
 * @param x      horizontal fractional position, 0..7
 * @param y      vertical fractional position, 0..7
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
/**
 * VC-1 8-pixel-wide chroma motion compensation, "no rounding" variant
 * ("avg" flavour: the interpolated value is averaged into dst via avg2()).
 *
 * Same bilinear 1/8-pel interpolation and 32-4 bias as
 * put_no_rnd_vc1_chroma_mc8_c(); the only difference is the final
 * rounded average with the pixel already present in dst (bi-prediction).
 *
 * @param dst    destination block, align 8; read and written
 * @param src    source pixels, align 1; rows 0..h and columns 0..8 are read
 * @param stride line size of both dst and src in bytes
 * @param h      number of rows to process
 * @param x      horizontal fractional position, 0..7
 * @param y      vertical fractional position, 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
/*
 * QPEL_MC() instantiates the full set of MPEG-4 quarter-pel motion
 * compensation C functions for one rounding mode.  The 8/16-wide h/v
 * lowpass helpers apply the MPEG-4 half-pel FIR filter (taps
 * 20, -6, 3, -1 with edge mirroring); the qpelN_mcXY_c entry points
 * combine them with pixel averaging (pixelsN_l2/_l4) to reach the
 * sixteen quarter-pel positions.  OPNAME prefixes the generated names,
 * RND selects the rounding variant of the helpers used internally, and
 * OP is the final store operator (put/avg, with or without rounding).
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2164
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2165
#define op_put(a, b) a = cm[((b) + 16)>>5]
2166
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2167

    
2168
QPEL_MC(0, put_       , _       , op_put)
2169
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2170
QPEL_MC(0, avg_       , _       , op_avg)
2171
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2172
#undef op_avg
2173
#undef op_avg_no_rnd
2174
#undef op_put
2175
#undef op_put_no_rnd
#if 1
2178
#define H264_LOWPASS(OPNAME, OP, OP2) \
2179
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180
    const int h=2;\
2181
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182
    int i;\
2183
    for(i=0; i<h; i++)\
2184
    {\
2185
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2186
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2187
        dst+=dstStride;\
2188
        src+=srcStride;\
2189
    }\
2190
}\
2191
\
2192
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2193
    const int w=2;\
2194
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2195
    int i;\
2196
    for(i=0; i<w; i++)\
2197
    {\
2198
        const int srcB= src[-2*srcStride];\
2199
        const int srcA= src[-1*srcStride];\
2200
        const int src0= src[0 *srcStride];\
2201
        const int src1= src[1 *srcStride];\
2202
        const int src2= src[2 *srcStride];\
2203
        const int src3= src[3 *srcStride];\
2204
        const int src4= src[4 *srcStride];\
2205
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207
        dst++;\
2208
        src++;\
2209
    }\
2210
}\
2211
\
2212
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2213
    const int h=2;\
2214
    const int w=2;\
2215
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2216
    int i;\
2217
    src -= 2*srcStride;\
2218
    for(i=0; i<h+5; i++)\
2219
    {\
2220
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2221
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2222
        tmp+=tmpStride;\
2223
        src+=srcStride;\
2224
    }\
2225
    tmp -= tmpStride*(h+5-2);\
2226
    for(i=0; i<w; i++)\
2227
    {\
2228
        const int tmpB= tmp[-2*tmpStride];\
2229
        const int tmpA= tmp[-1*tmpStride];\
2230
        const int tmp0= tmp[0 *tmpStride];\
2231
        const int tmp1= tmp[1 *tmpStride];\
2232
        const int tmp2= tmp[2 *tmpStride];\
2233
        const int tmp3= tmp[3 *tmpStride];\
2234
        const int tmp4= tmp[4 *tmpStride];\
2235
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2236
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2237
        dst++;\
2238
        tmp++;\
2239
    }\
2240
}\
2241
/* 4-wide horizontal 6-tap qpel lowpass over 4 rows. */\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 4-wide vertical 6-tap qpel lowpass over 4 rows. */\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* 4x4 combined H+V qpel lowpass: horizontal pass into tmp, vertical pass with OP2. */\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 8-wide horizontal 6-tap qpel lowpass over 8 rows. */\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 8-wide vertical 6-tap qpel lowpass over 8 rows. */\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 combined H+V qpel lowpass: horizontal pass into tmp, vertical pass with OP2. */\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16x16 vertical lowpass, assembled from four 8x8 quadrant calls. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
/* 16x16 horizontal lowpass, assembled from four 8x8 quadrant calls. */\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
/* 16x16 combined H+V lowpass, assembled from four 8x8 quadrant calls; last function of the H264_LOWPASS macro. */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* Template generating the 16 quarter-pel motion-compensation functions
 * (_mcXY_c, X/Y = quarter-pel fraction 0..3) for one block SIZE, by
 * combining the _h/_v/_hv lowpass primitives and the pixel copy/blend
 * helpers generated elsewhere in this file. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Rounding/store operators used while instantiating the templates above:
 * op_put stores the clipped value, op_avg blends it with the existing
 * destination pixel; the "+16>>5" (resp. "+512>>10" for the two-pass
 * OP2 case) normalizes the 6-tap filter sum before clipping via cm. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass primitives and the quarter-pel MC functions
 * for the put_ and avg_ variants and all supported block sizes. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
/**
 * WMV2 horizontal half-pel filter: each output pixel is the 4-tap
 * (-1,9,9,-1)/16 combination of its horizontal neighbours, rounded and
 * clipped through the cm table.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int row, x;

    for (row = 0; row < h; row++) {
        /* same left-to-right order as the unrolled original */
        for (x = 0; x < 8; x++)
            dst[x] = cm[(9*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 8) >> 4];
        dst += dstStride;
        src += srcStride;
    }
}
#if CONFIG_CAVS_DECODER
/* AVS specific */
/* Full-pel (0,0) motion compensation entry points for CAVS; they simply
 * forward to the generic 8x8/16x16 pixel copy and average helpers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if CONFIG_VC1_DECODER
/* VC-1 specific */
/* Full-pel (0,0) cases for VC-1; the rounding parameter is unused here
 * since no interpolation takes place. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
#if CONFIG_RV40_DECODER
/* RV40 (3,3) sub-pel cases: delegate to the generic xy2 (diagonal
 * half-pel average) pixel helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
2660
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2661
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2662
    int i;
2663

    
2664
    for(i=0; i<w; i++){
2665
        const int src_1= src[ -srcStride];
2666
        const int src0 = src[0          ];
2667
        const int src1 = src[  srcStride];
2668
        const int src2 = src[2*srcStride];
2669
        const int src3 = src[3*srcStride];
2670
        const int src4 = src[4*srcStride];
2671
        const int src5 = src[5*srcStride];
2672
        const int src6 = src[6*srcStride];
2673
        const int src7 = src[7*srcStride];
2674
        const int src8 = src[8*srcStride];
2675
        const int src9 = src[9*srcStride];
2676
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2677
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2678
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2679
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2680
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2681
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2682
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2683
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2684
        src++;
2685
        dst++;
2686
    }
2687
}
2688

    
/* mspel (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* mspel (1,0): blend the source with its horizontally filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* mspel (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* mspel (3,0): blend the source shifted one pixel right with the
 * horizontally filtered version. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* mspel (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* mspel (1,2): average of the vertically filtered block and the
 * H-then-V filtered block (halfH holds 11 filtered rows starting one
 * row above the block so the vertical pass has its context). */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the pure vertical pass starts one pixel
 * to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): horizontal pass into a temp buffer, then vertical pass
 * of the filtered rows into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/* H.263 Annex-J style deblocking across a horizontal block edge:
 * filters the two rows above and below the edge, in place, for 8 columns.
 * Compiled away unless an H.263 codec is configured. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction: full for small |d|, ramping down
         * to zero once |d| exceeds 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255: bit 8 set means out of range;
         * ~(p>>31) is 0 for negative values and 255 (as a byte) otherwise */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        /* secondary, weaker correction of the outer pixels */
        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
/* H.263 deblocking across a vertical block edge: same algorithm as
 * h263_v_loop_filter_c but applied to the two columns left/right of the
 * edge, for 8 rows, in place. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction, see h263_v_loop_filter_c */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * The vertical pass goes into temp[] (edge rows are passed through,
 * pre-scaled by 4); the horizontal pass writes back, leaving the edge
 * columns only vertically filtered. */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* edge columns: only the vertical pass applies (round, /4) */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            /* interior: horizontal (1,2,1) over the vertically filtered
             * values, total scale 16 */
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks.
 * The first argument is an unused context pointer kept for the common
 * comparison-function signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the horizontal half-pel interpolation
 * of the reference (avg2 of each pair of neighbours; avg2 is defined
 * earlier in this file). Reads one extra column of pix2.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the vertical half-pel interpolation of
 * the reference (avg2 of vertically adjacent rows). Reads one extra row
 * of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16-wide block against the diagonal half-pel interpolation of
 * the reference (avg4 of the 2x2 neighbourhood; avg4 is defined earlier
 * in this file). Reads one extra row and column of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], pix3[x], pix3[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/* Plain 8-wide sum of absolute differences between two pixel blocks. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}

/* 8-wide SAD against the horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}

/* 8-wide SAD against the vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}

/* 8-wide SAD against the half-pel (x+y) four-tap average of pix2. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, row, col;
    uint8_t *pix3 = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3039
    MpegEncContext *c = v;
3040
    int score1=0;
3041
    int score2=0;
3042
    int x,y;
3043

    
3044
    for(y=0; y<h; y++){
3045
        for(x=0; x<16; x++){
3046
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3047
        }
3048
        if(y+1<h){
3049
            for(x=0; x<15; x++){
3050
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3051
                             - s1[x+1] + s1[x+1+stride])
3052
                        -FFABS(  s2[x  ] - s2[x  +stride]
3053
                             - s2[x+1] + s2[x+1+stride]);
3054
            }
3055
        }
3056
        s1+= stride;
3057
        s2+= stride;
3058
    }
3059

    
3060
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3061
    else  return score1 + FFABS(score2)*8;
3062
}
3063

    
3064
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3065
    MpegEncContext *c = v;
3066
    int score1=0;
3067
    int score2=0;
3068
    int x,y;
3069

    
3070
    for(y=0; y<h; y++){
3071
        for(x=0; x<8; x++){
3072
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3073
        }
3074
        if(y+1<h){
3075
            for(x=0; x<7; x++){
3076
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3077
                             - s1[x+1] + s1[x+1+stride])
3078
                        -FFABS(  s2[x  ] - s2[x  +stride]
3079
                             - s2[x+1] + s2[x+1+stride]);
3080
            }
3081
        }
3082
        s1+= stride;
3083
        s2+= stride;
3084
    }
3085

    
3086
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3087
    else  return score1 + FFABS(score2)*8;
3088
}
3089

    
3090
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3091
    int i;
3092
    unsigned int sum=0;
3093

    
3094
    for(i=0; i<8*8; i++){
3095
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3096
        int w= weight[i];
3097
        b>>= RECON_SHIFT;
3098
        assert(-512<b && b<512);
3099

    
3100
        sum += (w*b)*(w*b)>>4;
3101
    }
3102
    return sum>>2;
3103
}
3104

    
3105
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3106
    int i;
3107

    
3108
    for(i=0; i<8*8; i++){
3109
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3110
    }
3111
}
3112

    
3113
/**
3114
 * permutes an 8x8 block.
3115
 * @param block the block which will be permuted according to the given permutation vector
3116
 * @param permutation the permutation vector
3117
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3118
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3119
 *                  (inverse) permutated to scantable order!
3120
 */
3121
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3122
{
3123
    int i;
3124
    DCTELEM temp[64];
3125

    
3126
    if(last<=0) return;
3127
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3128

    
3129
    for(i=0; i<=last; i++){
3130
        const int j= scantable[i];
3131
        temp[j]= block[j];
3132
        block[j]=0;
3133
    }
3134

    
3135
    for(i=0; i<=last; i++){
3136
        const int j= scantable[i];
3137
        const int perm_j= permutation[j];
3138
        block[perm_j]= temp[j];
3139
    }
3140
}
3141

    
3142
/* Trivial comparator for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

/* Fill cmp[0..5] with the compare function selected by the low byte of
 * type, taking the implementations from the given DSPContext.  Unknown
 * types leave the slot NULL and log an error. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for (i = 0; i < 6; i++) {
        me_cmp_func f = NULL;

        switch (type & 0xFF) {
        case FF_CMP_SAD:    f = c->sad[i];            break;
        case FF_CMP_SATD:   f = c->hadamard8_diff[i]; break;
        case FF_CMP_SSE:    f = c->sse[i];            break;
        case FF_CMP_DCT:    f = c->dct_sad[i];        break;
        case FF_CMP_DCT264: f = c->dct264_sad[i];     break;
        case FF_CMP_DCTMAX: f = c->dct_max[i];        break;
        case FF_CMP_PSNR:   f = c->quant_psnr[i];     break;
        case FF_CMP_BIT:    f = c->bit[i];            break;
        case FF_CMP_RD:     f = c->rd[i];             break;
        case FF_CMP_VSAD:   f = c->vsad[i];           break;
        case FF_CMP_VSSE:   f = c->vsse[i];           break;
        case FF_CMP_ZERO:   f = zero_cmp;             break;
        case FF_CMP_NSSE:   f = c->nsse[i];           break;
#if CONFIG_DWT
        case FF_CMP_W53:    f = c->w53[i];            break;
        case FF_CMP_W97:    f = c->w97[i];            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
        cmp[i] = f;
    }
}

/* Zero a single 8x8 coefficient block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64 * sizeof(*block));
}

/**
3212
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3213
 */
3214
static void clear_blocks_c(DCTELEM *blocks)
3215
{
3216
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3217
}
3218

    
3219
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3220
    long i;
3221
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3222
        long a = *(long*)(src+i);
3223
        long b = *(long*)(dst+i);
3224
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3225
    }
3226
    for(; i<w; i++)
3227
        dst[i+0] += src[i+0];
3228
}
3229

    
3230
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3231
    long i;
3232
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3233
        long a = *(long*)(src1+i);
3234
        long b = *(long*)(src2+i);
3235
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3236
    }
3237
    for(; i<w; i++)
3238
        dst[i] = src1[i]+src2[i];
3239
}
3240

    
3241
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3242
    long i;
3243
#if !HAVE_FAST_UNALIGNED
3244
    if((long)src2 & (sizeof(long)-1)){
3245
        for(i=0; i+7<w; i+=8){
3246
            dst[i+0] = src1[i+0]-src2[i+0];
3247
            dst[i+1] = src1[i+1]-src2[i+1];
3248
            dst[i+2] = src1[i+2]-src2[i+2];
3249
            dst[i+3] = src1[i+3]-src2[i+3];
3250
            dst[i+4] = src1[i+4]-src2[i+4];
3251
            dst[i+5] = src1[i+5]-src2[i+5];
3252
            dst[i+6] = src1[i+6]-src2[i+6];
3253
            dst[i+7] = src1[i+7]-src2[i+7];
3254
        }
3255
    }else
3256
#endif
3257
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3258
        long a = *(long*)(src1+i);
3259
        long b = *(long*)(src2+i);
3260
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3261
    }
3262
    for(; i<w; i++)
3263
        dst[i+0] = src1[i+0]-src2[i+0];
3264
}
3265

    
3266
/* HuffYUV median prediction decode: reconstruct each output byte as
 * median(left, top, left+top-topleft) + diff.  *left / *left_top carry the
 * running state across calls and are updated on return. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;

    for (i = 0; i < w; i++) {
        cur = mid_pred(cur, src1[i], (cur + src1[i] - top_left) & 0xFF) + diff[i];
        top_left = src1[i];
        dst[i]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}

/* HuffYUV median prediction encode: emit src2 minus the median predictor
 * median(left, top, left+top-topleft).  *left / *left_top carry the running
 * state across calls and are updated on return. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur, src1[i], (cur + src1[i] - top_left) & 0xFF);
        top_left = src1[i];
        cur      = src2[i];
        dst[i]   = cur - pred;
    }

    *left     = cur;
    *left_top = top_left;
}

/* Left (previous pixel) prediction decode: running byte sum of src into
 * dst, seeded with acc; returns the final accumulator so the caller can
 * continue on the next slice.  (The original unrolled this by two; a plain
 * loop is byte-for-byte equivalent.) */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/* Left prediction decode over packed 32-bit BGRA pixels: each channel keeps
 * its own running sum, seeded from and written back through the four state
 * pointers.  Channel byte offsets depend on endianness (macros above). */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        b += src[4 * i + B];  dst[4 * i + B] = b;
        g += src[4 * i + G];  dst[4 * i + G] = g;
        r += src[4 * i + R];  dst[4 * i + R] = r;
        a += src[4 * i + A];  dst[4 * i + A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A

/* 2-point butterfly: o1/o2 receive the sum and difference of i1/i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly folded into the absolute-value sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/* SATD compare (FF_CMP_SATD): 8x8 Hadamard transform of the residual
 * src - dst, returning the sum of absolute transform coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each residual row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard on each column; the last butterfly stage
     * is folded into BUTTERFLYA, which also accumulates absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

/* Intra SATD: 8x8 Hadamard transform of src itself (dummy is unused),
 * summing absolute coefficients and subtracting the DC term ("mean"). */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard per column; last stage folded into
     * BUTTERFLYA which accumulates the absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3477
    MpegEncContext * const s= (MpegEncContext *)c;
3478
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3479

    
3480
    assert(h==8);
3481

    
3482
    s->dsp.diff_pixels(temp, src1, src2, stride);
3483
    s->dsp.fdct(temp);
3484
    return s->dsp.sum_abs_dctelem(temp);
3485
}
3486

    
3487
#if CONFIG_GPL
/* One 8-point H.264-style integer transform stage.  SRC/DST are supplied as
 * macros by the caller so the same code serves both rows and columns. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/* DCT-SAD compare using the 8x8 H.264 integer transform of the residual:
 * rows are transformed in place, then columns are transformed while summing
 * absolute values through the DST macro. */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3541
    MpegEncContext * const s= (MpegEncContext *)c;
3542
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3543
    int sum=0, i;
3544

    
3545
    assert(h==8);
3546

    
3547
    s->dsp.diff_pixels(temp, src1, src2, stride);
3548
    s->dsp.fdct(temp);
3549

    
3550
    for(i=0; i<64; i++)
3551
        sum= FFMAX(sum, FFABS(temp[i]));
3552

    
3553
    return sum;
3554
}
3555

    
3556
/* PSNR-after-quantization compare: run the residual through the encoder's
 * quantize / dequantize / IDCT round trip and return the squared error the
 * round trip introduced (a proxy for coding distortion). */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* evaluated as an inter block */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep an unquantized copy to diff against */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

/* Rate-distortion compare (FF_CMP_RD): quantize the 8x8 residual, count the
 * bits its run/level VLC coding would take, reconstruct, and return
 * distortion + lambda-scaled bit cost. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on aligned local copies so idct_add below can reconstruct */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code DC separately and start the AC scan at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* sum VLC lengths of all (run, level) pairs; levels outside
         * -64..63 fall back to the escape-code length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias into the 0..127 table range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct into lsrc2 and measure distortion against the source */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

/* Bit-count compare (FF_CMP_BIT): quantize the 8x8 residual and return the
 * number of bits its run/level VLC coding would need — the rate part of
 * rd8x8_c without the distortion term. */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code DC separately and start the AC scan at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* sum VLC lengths of all (run, level) pairs; levels outside
         * -64..63 fall back to the escape-code length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias into the 0..127 table range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

/* Intra vertical SAD (FF_CMP_VSAD): sum of |s[x] - s[x+stride]| over all
 * vertically adjacent pixel pairs of one plane; size is the block width.
 * Expanded below for 8- and 16-wide blocks. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

/* Inter vertical SAD: absolute vertical-gradient difference between the
 * two planes, summed over all adjacent row pairs of a 16-wide block. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int acc = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            acc += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return acc;
}

/* Squared-difference helper for the VSSE metrics below. */
#define SQ(a) ((a)*(a))
/* Intra vertical SSE (FF_CMP_VSSE): squared-difference analogue of
 * VSAD_INTRA; expanded below for 8- and 16-wide blocks. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

/* Inter vertical SSE: squared vertical-gradient difference between the two
 * planes, summed over all adjacent row pairs of a 16-wide block. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int acc = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            acc += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return acc;
}

/* Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int acc = 0, k;
    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        acc += d * d;
    }
    return acc;
}

/* Expand each 8x8 compare function into a 16x16 variant that sums the
 * results over the four 8x8 quadrants (WRAPPER8_16_SQ macro). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

/* Element-wise in-place multiply: dst[i] *= src[i]. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = dst[k] * src[k];
}

/* dst[i] = src0[i] * src1[len-1-i] — multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}

/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src2[k] + src0[k] * src1[k];
}

/* Windowed overlap of two half-buffers into dst[0..2*len-1]:
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i] + add_bias
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j] + add_bias
 * for i = -len..-1, j = len-1..0.  Each (i,j) pair is read before either
 * output is written, so dst may overlap src0.
 * NOTE(review): used as the MDCT overlap-add window step — confirm against
 * callers; the code itself only shows the arithmetic above. */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

/* Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;
    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}

/* Multiply src by per-pair 2-element scale vectors and a scalar:
 * each consecutive pair of samples uses the next entry of sv. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 2) {
        const float *scale = *sv++;
        dst[k]     = src[k]     * scale[0] * mul;
        dst[k + 1] = src[k + 1] * scale[1] * mul;
    }
}

/* Multiply src by per-quad sub-vectors and a global scalar:
 * for each group of four, dst[i+k] = src[i+k] * sv[i/4][k] * mul (k = 0..3). */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int n, k;
    for (n = 0; n < len; n += 4, sv++) {
        const float *quad = sv[0];
        for (k = 0; k < 4; k++)
            dst[n + k] = src[n + k] * quad[k] * mul;
    }
}
3864

    
3865
/* Expand per-pair sub-vectors scaled by mul: dst[i+k] = sv[i/2][k] * mul (k = 0,1). */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n;
    for (n = 0; n < len; n += 2, sv++) {
        const float *pair = sv[0];
        dst[n]     = pair[0] * mul;
        dst[n + 1] = pair[1] * mul;
    }
}
3874

    
3875
/* Expand per-quad sub-vectors scaled by mul: dst[i+k] = sv[i/4][k] * mul (k = 0..3). */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int n, k;
    for (n = 0; n < len; n += 4, sv++) {
        const float *quad = sv[0];
        for (k = 0; k < 4; k++)
            dst[n + k] = quad[k] * mul;
    }
}
3886

    
3887
/* In-place butterfly: (v1[i], v2[i]) <- (v1[i] + v2[i], v1[i] - v2[i]).
 * The restrict qualifiers promise the two vectors do not overlap. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;
    for (n = 0; n < len; n++) {
        float sum  = v1[n] + v2[n];
        float diff = v1[n] - v2[n];
        v1[n] = sum;
        v2[n] = diff;
    }
}
3897

    
3898
/* Dot product of two float vectors, accumulated left to right. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    int n;
    float acc = 0.0;

    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
3908

    
3909
/* Convert int samples to float while scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int n;
    for (n = 0; n < len; n++) {
        dst[n] = src[n] * mul;
    }
}
3914

    
3915
/**
 * Clamp one float, given as its IEEE-754 bit pattern, to [min, max] where
 * min < 0 < max (guaranteed by the caller).  Negative floats compare above
 * all positive ones as unsigned integers, so "a > mini" catches everything
 * below min; XOR-ing the sign bit orders the positives for the max test.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    /* 1U<<31: shifting a signed 1 into the sign bit is undefined behavior */
    else if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    else
        return a;
}

/**
 * Clip each of len floats in src to [*min, *max] and store into dst, using
 * unsigned integer comparisons on the raw bit patterns.  Requires
 * *min < 0 < *max; len is processed in groups of 8 (caller guarantees the
 * multiple — see vector_clipf_c).
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    /* Read the bounds' bit patterns through a union — well-defined type
     * punning, unlike the previous *(uint32_t*)min pointer casts which
     * violated strict aliasing. */
    union { float f; uint32_t bits; } bound;
    uint32_t mini, maxi, maxisign;
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    bound.f = *min; mini = bound.bits;
    bound.f = *max; maxi = bound.bits;
    maxisign = maxi ^ (1U << 31);

    for (i = 0; i < len; i += 8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
3942
/* Clip each of len floats in src to [min, max]; len is a multiple of 8.
 * Bounds of opposite sign take the bit-twiddling fast path. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }
    {
        int n, k;
        for (n = 0; n < len; n += 8) {
            for (k = 0; k < 8; k++)
                dst[n + k] = av_clipf(src[n + k], min, max);
        }
    }
}
3959

    
3960
/**
 * Convert one pre-biased float sample to int16 by reinterpreting its bits.
 * Assumes the caller biased the sample so that in-range values have bit
 * patterns 0x43c00000..0x43c0ffff (out-of-range values saturate) — only the
 * low 16 bits of the return value are meaningful; callers truncate to
 * int16_t.
 */
static av_always_inline int float_to_int16_one(const float *src){
    /* Type-pun through a union — well-defined, unlike the previous
     * *(const int32_t*)src cast, which violated strict aliasing. */
    union { float f; int32_t i; } u = { *src };
    int_fast32_t tmp = u.i;
    if(tmp & 0xf0000){
        /* saturate: 0 if below range, -1 (-> 0x7fff after the bias
         * subtraction and int16 truncation) if above */
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}
3970

    
3971
/* Convert len pre-biased floats to int16 samples (see float_to_int16_one). */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long n;
    for (n = 0; n < len; n++) {
        dst[n] = float_to_int16_one(src + n);
    }
}
3976

    
3977
/* Convert planar float channels to interleaved int16; the stereo case is
 * handled with a dedicated pairwise loop. */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, c;
    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i]     = float_to_int16_one(src[0] + i);
            dst[2*i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++) {
            int16_t *out = dst + c;
            for (i = 0; i < len; i++, out += channels)
                *out = float_to_int16_one(src[c] + i);
        }
    }
}
3990

    
3991
/* Dot product of two int16 vectors, each product right-shifted by `shift`
 * before accumulation. */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int n;
    int acc = 0;

    for (n = 0; n < order; n++)
        acc += (v1[n] * v2[n]) >> shift;

    return acc;
}
4000

    
4001
/* Dot product of v1 and v2 while simultaneously updating v1 in place:
 * v1[i] += mul * v3[i].  The product uses v1's value BEFORE the update. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int n;
    int acc = 0;
    for (n = 0; n < order; n++) {
        acc   += v1[n] * v2[n];
        v1[n] += mul * v3[n];
    }
    return acc;
}
4010

    
4011
#define W0 2048
4012
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4013
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4014
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4015
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4016
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4017
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4018
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4019

    
4020
/* One row of the 8-point WMV2 inverse DCT, fixed point with the Wn
 * constants above (Wn = 2048*sqrt(2)*cos(n*pi/16)). */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1: odd/even partial butterflies*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2: 181/256 approximates 1/sqrt(2), with rounding*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: final butterflies, rounded and descaled by 2^8*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
4046
/* One column (stride 8) of the 8-point WMV2 inverse DCT.  Step 1 keeps 3
 * extra fractional bits (>>3 instead of the row pass's deferred shift) and
 * the final descale is by 2^14. */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2: 181/256 approximates 1/sqrt(2), with rounding*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: final butterflies, rounded and descaled by 2^14*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
4073
/* Full 8x8 WMV2 inverse DCT: a row pass over all 8 rows, then a column
 * pass over all 8 columns. */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 64; n += 8)
        wmv2_idct_row(block + n);
    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
4083
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4084
 converted */
4085
/* WMV2 IDCT + store: inverse-transform the block and write the clamped
 * pixels to dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT + add: inverse-transform the block and add the clamped result
 * onto the existing pixels in dest. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4095
/* Reference (jrevdct) 8x8 IDCT + store clamped pixels. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Reference (jrevdct) 8x8 IDCT + add clamped result to dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4105

    
4106
/* 4x4 reduced-resolution IDCT + store (used for lowres==1 decoding). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT + add (used for lowres==1 decoding). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4116

    
4117
/* 2x2 reduced-resolution IDCT + store (used for lowres==2 decoding). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT + add (used for lowres==2 decoding). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4127

    
4128
/* 1x1 "IDCT" + store: only the DC coefficient survives; descale it
 * (round, >>3) and clamp to [0,255] via the crop table. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 "IDCT" + add: add the descaled DC coefficient to the destination
 * pixel, clamped via the crop table. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4140

    
4141
/* Deliberate no-op with a motion-compensation-style signature; all
 * parameters are ignored (marked av_unused). */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4142

    
4143
/* init static data */
4144
av_cold void dsputil_static_init(void)
4145
{
4146
    int i;
4147

    
4148
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4149
    for(i=0;i<MAX_NEG_CROP;i++) {
4150
        ff_cropTbl[i] = 0;
4151
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4152
    }
4153

    
4154
    for(i=0;i<512;i++) {
4155
        ff_squareTbl[i] = (i - 256) * (i - 256);
4156
    }
4157

    
4158
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4159
}
4160

    
4161
/**
 * Verify that the compiler honors 16-byte alignment of stack variables.
 * Returns 0 if the test variable is correctly aligned, -1 otherwise; the
 * warning is logged only once (on MMX/AltiVec builds) thanks to the static
 * did_fail flag.
 */
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
4180

    
4181
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4182
{
4183
    int i;
4184

    
4185
    ff_check_alignment();
4186

    
4187
#if CONFIG_ENCODERS
4188
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4189
        c->fdct = fdct_ifast;
4190
        c->fdct248 = fdct_ifast248;
4191
    }
4192
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4193
        c->fdct = ff_faandct;
4194
        c->fdct248 = ff_faandct248;
4195
    }
4196
    else {
4197
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4198
        c->fdct248 = ff_fdct248_islow;
4199
    }
4200
#endif //CONFIG_ENCODERS
4201

    
4202
    if(avctx->lowres==1){
4203
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4204
            c->idct_put= ff_jref_idct4_put;
4205
            c->idct_add= ff_jref_idct4_add;
4206
        }else{
4207
            c->idct_put= ff_h264_lowres_idct_put_c;
4208
            c->idct_add= ff_h264_lowres_idct_add_c;
4209
        }
4210
        c->idct    = j_rev_dct4;
4211
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4212
    }else if(avctx->lowres==2){
4213
        c->idct_put= ff_jref_idct2_put;
4214
        c->idct_add= ff_jref_idct2_add;
4215
        c->idct    = j_rev_dct2;
4216
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4217
    }else if(avctx->lowres==3){
4218
        c->idct_put= ff_jref_idct1_put;
4219
        c->idct_add= ff_jref_idct1_add;
4220
        c->idct    = j_rev_dct1;
4221
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4222
    }else{
4223
        if(avctx->idct_algo==FF_IDCT_INT){
4224
            c->idct_put= ff_jref_idct_put;
4225
            c->idct_add= ff_jref_idct_add;
4226
            c->idct    = j_rev_dct;
4227
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4228
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4229
                avctx->idct_algo==FF_IDCT_VP3){
4230
            c->idct_put= ff_vp3_idct_put_c;
4231
            c->idct_add= ff_vp3_idct_add_c;
4232
            c->idct    = ff_vp3_idct_c;
4233
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4234
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4235
            c->idct_put= ff_wmv2_idct_put_c;
4236
            c->idct_add= ff_wmv2_idct_add_c;
4237
            c->idct    = ff_wmv2_idct_c;
4238
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4239
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4240
            c->idct_put= ff_faanidct_put;
4241
            c->idct_add= ff_faanidct_add;
4242
            c->idct    = ff_faanidct;
4243
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4244
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4245
            c->idct_put= ff_ea_idct_put_c;
4246
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4247
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4248
            c->idct     = ff_bink_idct_c;
4249
            c->idct_add = ff_bink_idct_add_c;
4250
            c->idct_put = ff_bink_idct_put_c;
4251
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4252
        }else{ //accurate/default