ffmpeg / libavcodec / dsputil.c @ 12802ec0

/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * DSP utils
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
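
/* Fill a ScanTable: permutated[] is the source scan order with the IDCT's
 * coefficient permutation applied, and raster_end[i] caches the highest
 * permuted index seen up to position i, so callers can find the last
 * nonzero coefficient in raster order. A typical call from a decoder's
 * init code looks like (sketch):
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
 */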
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
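
/* pix_norm1_c and the sse*_c functions below index ff_squareTbl through a
 * pointer offset by 256, so that sq[d] == d*d for any byte difference d in
 * [-255, 255]; the table itself is filled at init time. */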
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    w    = end_x - start_x;
    src += start_y*linesize + start_x;
    buf += start_x;

    //top
    for(y=0; y<start_y; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    // copy existing part
    for(; y<end_y; y++){
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    //bottom
    src -= linesize;
    for(; y<block_h; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    buf -= block_h * linesize + start_x;
    while (block_h--){
       //left
        for(x=0; x<start_x; x++){
            buf[x] = buf[start_x];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x] = buf[end_x - 1];
        }
        buf += linesize;
    }
}
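
/* Typical use (sketch): when a motion vector reaches outside the reference
 * picture, callers such as mpegvideo render the block into a scratch buffer
 * first and read from there instead, e.g.
 *     ff_emulated_edge_mc(s->edge_emu_buffer, ptr, s->linesize, 9, 9,
 *                         src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 */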
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

    
439

    
440
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
441
                                 int line_size)
442
{
443
    int i;
444
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445

    
446
    /* read the pixels */
447
    for(i=0;i<8;i++) {
448
        pixels[0] = cm[block[0]];
449
        pixels[1] = cm[block[1]];
450
        pixels[2] = cm[block[2]];
451
        pixels[3] = cm[block[3]];
452
        pixels[4] = cm[block[4]];
453
        pixels[5] = cm[block[5]];
454
        pixels[6] = cm[block[6]];
455
        pixels[7] = cm[block[7]];
456

    
457
        pixels += line_size;
458
        block += 8;
459
    }
460
}
461

    
462
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
463
                                 int line_size)
464
{
465
    int i;
466
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
467

    
468
    /* read the pixels */
469
    for(i=0;i<4;i++) {
470
        pixels[0] = cm[block[0]];
471
        pixels[1] = cm[block[1]];
472
        pixels[2] = cm[block[2]];
473
        pixels[3] = cm[block[3]];
474

    
475
        pixels += line_size;
476
        block += 8;
477
    }
478
}
479

    
480
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
481
                                 int line_size)
482
{
483
    int i;
484
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485

    
486
    /* read the pixels */
487
    for(i=0;i<2;i++) {
488
        pixels[0] = cm[block[0]];
489
        pixels[1] = cm[block[1]];
490

    
491
        pixels += line_size;
492
        block += 8;
493
    }
494
}
495

    
496
static void put_signed_pixels_clamped_c(const DCTELEM *block,
497
                                        uint8_t *restrict pixels,
498
                                        int line_size)
499
{
500
    int i, j;
501

    
502
    for (i = 0; i < 8; i++) {
503
        for (j = 0; j < 8; j++) {
504
            if (*block < -128)
505
                *pixels = 0;
506
            else if (*block > 127)
507
                *pixels = 255;
508
            else
509
                *pixels = (uint8_t)(*block + 128);
510
            block++;
511
            pixels++;
512
        }
513
        pixels += (line_size - 8);
514
    }
515
}
516

    
517
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
518
                                    int line_size)
519
{
520
    int i;
521

    
522
    /* read the pixels */
523
    for(i=0;i<8;i++) {
524
        pixels[0] = block[0];
525
        pixels[1] = block[1];
526
        pixels[2] = block[2];
527
        pixels[3] = block[3];
528
        pixels[4] = block[4];
529
        pixels[5] = block[5];
530
        pixels[6] = block[6];
531
        pixels[7] = block[7];
532

    
533
        pixels += line_size;
534
        block += 8;
535
    }
536
}
537

    
538
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
539
                          int line_size)
540
{
541
    int i;
542
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
543

    
544
    /* read the pixels */
545
    for(i=0;i<8;i++) {
546
        pixels[0] = cm[pixels[0] + block[0]];
547
        pixels[1] = cm[pixels[1] + block[1]];
548
        pixels[2] = cm[pixels[2] + block[2]];
549
        pixels[3] = cm[pixels[3] + block[3]];
550
        pixels[4] = cm[pixels[4] + block[4]];
551
        pixels[5] = cm[pixels[5] + block[5]];
552
        pixels[6] = cm[pixels[6] + block[6]];
553
        pixels[7] = cm[pixels[7] + block[7]];
554
        pixels += line_size;
555
        block += 8;
556
    }
557
}
558

    
559
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
560
                          int line_size)
561
{
562
    int i;
563
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
564

    
565
    /* read the pixels */
566
    for(i=0;i<4;i++) {
567
        pixels[0] = cm[pixels[0] + block[0]];
568
        pixels[1] = cm[pixels[1] + block[1]];
569
        pixels[2] = cm[pixels[2] + block[2]];
570
        pixels[3] = cm[pixels[3] + block[3]];
571
        pixels += line_size;
572
        block += 8;
573
    }
574
}
575

    
576
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
577
                          int line_size)
578
{
579
    int i;
580
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
581

    
582
    /* read the pixels */
583
    for(i=0;i<2;i++) {
584
        pixels[0] = cm[pixels[0] + block[0]];
585
        pixels[1] = cm[pixels[1] + block[1]];
586
        pixels += line_size;
587
        block += 8;
588
    }
589
}
590

    
591
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
592
{
593
    int i;
594
    for(i=0;i<8;i++) {
595
        pixels[0] += block[0];
596
        pixels[1] += block[1];
597
        pixels[2] += block[2];
598
        pixels[3] += block[3];
599
        pixels[4] += block[4];
600
        pixels[5] += block[5];
601
        pixels[6] += block[6];
602
        pixels[7] += block[7];
603
        pixels += line_size;
604
        block += 8;
605
    }
606
}
607

    
608
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
609
{
610
    int i;
611
    for(i=0;i<4;i++) {
612
        pixels[0] += block[0];
613
        pixels[1] += block[1];
614
        pixels[2] += block[2];
615
        pixels[3] += block[3];
616
        pixels += line_size;
617
        block += 4;
618
    }
619
}
620

    
621
static int sum_abs_dctelem_c(DCTELEM *block)
622
{
623
    int sum=0, i;
624
    for(i=0; i<64; i++)
625
        sum+= FFABS(block[i]);
626
    return sum;
627
}
628

    
629
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
630
{
631
    int i;
632

    
633
    for (i = 0; i < h; i++) {
634
        memset(block, value, 16);
635
        block += line_size;
636
    }
637
}
638

    
639
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
640
{
641
    int i;
642

    
643
    for (i = 0; i < h; i++) {
644
        memset(block, value, 8);
645
        block += line_size;
646
    }
647
}
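
/* 2x2 nearest-neighbour upscale of an 8x8 block: multiplying a byte by
 * 0x0101 replicates it into both halves of a uint16_t, and writing each
 * row through both dst1 and dst2 duplicates it vertically. */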
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
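
/* The 32-bit variant below averages four bytes per operation without
 * inter-byte carries: rnd_avg32(a,b) = (a|b) - (((a^b)&0xFEFEFEFE)>>1)
 * yields the per-byte rounded-up average (a+b+1)>>1, while the no_rnd
 * form (a&b) + (((a^b)&0xFEFEFEFE)>>1) rounds down, using the identity
 * a+b == (a^b) + 2*(a&b) == 2*(a|b) - (a^b). */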
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

    
1198
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1199
{
1200
    const int A=(16-x16)*(16-y16);
1201
    const int B=(   x16)*(16-y16);
1202
    const int C=(16-x16)*(   y16);
1203
    const int D=(   x16)*(   y16);
1204
    int i;
1205

    
1206
    for(i=0; i<h; i++)
1207
    {
1208
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1209
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1210
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1211
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1212
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1213
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1214
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1215
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1216
        dst+= stride;
1217
        src+= stride;
1218
    }
1219
}
1220

    
1221
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1222
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1223
{
1224
    int y, vx, vy;
1225
    const int s= 1<<shift;
1226

    
1227
    width--;
1228
    height--;
1229

    
1230
    for(y=0; y<h; y++){
1231
        int x;
1232

    
1233
        vx= ox;
1234
        vy= oy;
1235
        for(x=0; x<8; x++){ //XXX FIXME optimize
1236
            int src_x, src_y, frac_x, frac_y, index;
1237

    
1238
            src_x= vx>>16;
1239
            src_y= vy>>16;
1240
            frac_x= src_x&(s-1);
1241
            frac_y= src_y&(s-1);
1242
            src_x>>=shift;
1243
            src_y>>=shift;
1244

    
1245
            if((unsigned)src_x < width){
1246
                if((unsigned)src_y < height){
1247
                    index= src_x + src_y*stride;
1248
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1249
                                           + src[index       +1]*   frac_x )*(s-frac_y)
1250
                                        + (  src[index+stride  ]*(s-frac_x)
1251
                                           + src[index+stride+1]*   frac_x )*   frac_y
1252
                                        + r)>>(shift*2);
1253
                }else{
1254
                    index= src_x + av_clip(src_y, 0, height)*stride;
1255
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1256
                                          + src[index       +1]*   frac_x )*s
1257
                                        + r)>>(shift*2);
1258
                }
1259
            }else{
1260
                if((unsigned)src_y < height){
1261
                    index= av_clip(src_x, 0, width) + src_y*stride;
1262
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1263
                                           + src[index+stride  ]*   frac_y )*s
1264
                                        + r)>>(shift*2);
1265
                }else{
1266
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1267
                    dst[y*stride + x]=    src[index         ];
1268
                }
1269
            }
1270

    
1271
            vx+= dxx;
1272
            vy+= dyx;
1273
        }
1274
        ox += dxy;
1275
        oy += dyy;
1276
    }
1277
}
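
/* Thirdpel interpolation (as used by SVQ3): 683 == round(2^11/3) and
 * 2731 == round(2^13/3), so e.g. (683*(2*a + b + 1)) >> 11 approximates
 * (2*a + b)/3, and the mc11/mc12/mc21/mc22 cases average four samples
 * with weights summing to 12 (2731*12 is just above 2^15). */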
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1493

    
1494
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

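/* H.264 chroma MC is a bilinear interpolation with eighth-pel weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which always sum to 64;
 * e.g. x=2, y=3 gives A=30, B=10, C=18, D=6. The op_put/op_avg macros
 * below therefore normalize with (v + 32) >> 6. When D is zero the
 * interpolation degenerates to two taps (A and E=B+C) along either the
 * horizontal or the vertical axis, selected via 'step'. */
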
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

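/* The MPEG-4 quarter-pel half-sample filter above has the 8-tap kernel
 * (-1, 3, -6, 20, 20, -6, 3, -1), whose coefficients sum to 32; near the
 * block edges the missing samples are mirrored, which is why e.g. dst[7]
 * reuses src[6..8]. op_put rounds with (v + 16) >> 5 while the no_rnd
 * variants use (v + 15) >> 5, matching MPEG-4's rounding control. The
 * mcXY functions then blend the H/V/HV half-sample planes to reach the
 * sixteen quarter-pel positions. */
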
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c

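/* The mc00 (full-pel) cases need no filtering at all, so they alias the
 * plain block copy/average helpers defined further below. */
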
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

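/* H264_LOWPASS implements H.264's 6-tap half-sample filter
 * (1, -5, 20, 20, -5, 1); the taps sum to 32, hence the (v + 16) >> 5
 * normalization in op_put/op_avg below. The hv path filters horizontally
 * into the 16-bit tmp buffer first and then vertically, so the combined
 * gain is 32*32 = 1024 and op2_put/op2_avg normalize with
 * (v + 512) >> 10 instead. */
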
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

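/* H264_MC expands the sixteen quarter-pel positions for one block size.
 * The mcXY suffix encodes the quarter-pel offset: X horizontal, Y
 * vertical, each in 0..3. Even offsets come straight from a half-sample
 * plane; odd offsets average two neighbouring planes with pixels*_l2. */
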
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

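/* The WMV2 mspel filters use the 4-tap half-sample kernel
 * (-1, 9, 9, -1)/16 with (v + 8) >> 4 rounding, clamped through the
 * crop table. */
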
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}

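/* These wrappers pin the variable-height pixel helpers to fixed 8x8 and
 * 16x16 sizes so they fit the (dst, src, stride) signature expected by
 * the qpel function-pointer tables (see the mc00 aliases above). */
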
#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

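/* RV40's (3/4, 3/4) position reduces to a plain 2x2 average, the same
 * operation as half-pel xy2 interpolation, so the generic pixels*_xy2
 * routines are reused directly. */
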
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

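/* Editor's note (observation, not from the original comments): the diagonal
 * cases (mc12/mc22/mc32) first run the horizontal filter over 11 rows
 * starting one row above the block (halfH is 8x11), because the vertical
 * 4-tap filter needs one row before and two rows after each output row;
 * halfH+8 then addresses row 1 of that buffer so the vertical pass reads
 * rows -1..9 of the filtered data. */
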
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

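/* Editor's illustration (not from the original comments): d1 is a ramp that
 * passes small gradients through and fades large ones out, e.g. with
 * strength=2: d=1 gives d1=1, d=3 gives d1=2*2-3=1, and d>=4 gives d1=0, so
 * strong edges are left untouched. The idiom "if(p&256) p= ~(p>>31)" is a
 * branch-light clamp of the 9-bit over/underflow back to 0..255: a negative
 * p yields ~(-1)=0, a p above 255 yields ~0, i.e. 255 in the low byte. */
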
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

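/* Editor's note (observation): this is a separable (1,2,1)/4 smoothing
 * filter. The first two loops do the vertical pass into temp[] at 4x scale
 * (the top and bottom border rows are just copied at 4x), the last loop does
 * the horizontal pass and renormalizes with (x + 8) >> 4; the border columns
 * only undo the 4x scale with (x + 2) >> 2. */
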
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

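/* Editor's note (assumption): the _x2/_y2/_xy2 variants compute the SAD
 * against the half-pel interpolated reference, which is why pix2 must provide
 * one extra column (x2/xy2) and one extra row (y2/xy2). avg2/avg4, defined
 * earlier in this file, are presumably the usual rounding averages
 * ((a+b+1)>>1) and ((a+b+c+d+2)>>2). */
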
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

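/* Editor's note (assumption): nsse ("noise preserving SSE") scores a block as
 * plain SSE (score1) plus a penalty for how much the local 2x2 gradient
 * pattern of s1 differs from that of s2 (score2), weighted by
 * avctx->nsse_weight (8 when no context is available). The intent is to let
 * encoders prefer matches that keep texture/noise rather than smearing it. */
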
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

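/* Editor's note (assumption): this pair supports trellis-like coefficient
 * refinement in the encoder: basis[] holds one DCT basis function at
 * BASIS_SHIFT precision and rem[] the current residual at RECON_SHIFT
 * precision; try_8x8basis returns the perceptually weighted squared error
 * that would remain if scale times the basis were added, so a coefficient
 * change can be evaluated cheaply before add_8x8basis commits it. */
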
/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed up the permutation
 * @param scantable the used scantable; it is only used to speed up the permutation,
 *                  the block is not (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

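/* Editor's illustration (assumption, not built): callers typically fill their
 * metric tables from the user-selected comparison ids, in the style of the
 * motion estimation setup: */
#if 0
ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
/* afterwards me_cmp[0] scores 16x16 blocks and me_cmp[1] 8x8 blocks
 * with the chosen metric (SAD, SATD, SSE, ...) */
#endif
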
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

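/* Editor's illustration (assumption, not built): add_bytes/diff_bytes use a
 * SWAR trick to process sizeof(long) bytes per iteration without carries
 * crossing byte boundaries: the low 7 bits of each byte are summed with the
 * MSBs masked off, then each byte's MSB is fixed up separately via XOR.
 * A scalar sketch of the same idea for two bytes packed in a uint16_t: */
#if 0
static uint16_t add_bytes_swar16(uint16_t a, uint16_t b)
{
    uint16_t lo = (a & 0x7f7f) + (b & 0x7f7f); /* per-byte sum of the low 7 bits */
    return lo ^ ((a ^ b) & 0x8080);            /* restore each byte's MSB        */
}
/* e.g. 0x7f + 0x01: lo = 0x80, (a^b)&0x80 = 0, result 0x80 -- no carry
 * leaks into the neighbouring byte. */
#endif
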
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

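/* Editor's note (observation): the HuffYUV median predictor predicts each
 * byte as mid_pred(left, top, left + top - topleft), i.e. the median of the
 * left neighbour, the top neighbour and the gradient prediction; sub_*
 * produces the transmitted differences on the encoder side and add_*
 * reconstructs the pixels from them. */
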
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

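/* Editor's note (observation): hadamard8_diff applies an 8x8 Hadamard
 * transform (three butterfly stages per dimension) to the difference block
 * and sums the absolute transform coefficients -- the SATD metric selected by
 * FF_CMP_SATD above. The _intra variant transforms the source itself and
 * subtracts the DC term so the block mean does not contribute to the score. */
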
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

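/* Editor's note (assumption): DCT8_1D is an 8-point integer transform in the
 * style of the H.264 high-profile 8x8 DCT (the CONFIG_GPL guard suggests the
 * code derives from a GPL source such as x264); dct264_sad applies it along
 * rows and then columns, summing absolute coefficients in the second pass
 * via the redefined DST macro. */
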
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

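/* Editor's note (assumption): rd8x8 is a small rate-distortion model: it
 * quantizes the difference block, counts the VLC bits the coefficients would
 * cost (with escapes), dequantizes, applies the IDCT and measures the
 * resulting SSE, returning distortion + lambda*bits where the factor
 * qscale*qscale*109/128 approximates the qscale-dependent lambda. */
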
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

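/* Editor's note (assumption): WRAPPER8_16_SQ, defined in dsputil.h, builds
 * the 16x16 version of each 8x8 metric by summing the 8x8 function over the
 * four quadrants, so only the 8x8 kernels need optimized implementations. */
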
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}

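/* Editor's note (assumption): vector_fmul_window is the overlap-add windowing
 * step used by MDCT-based audio decoders: with the pointers centred at
 * dst+len and win+len, each mirrored pair (i, j) combines a sample of the
 * saved previous half-frame (src0) with a sample of the current frame (src1)
 * through the symmetric window values win[i] and win[j], computing
 * dst[i] = s0*wj - s1*wi and dst[j] = s0*wi + s1*wj in one pass. */
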
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

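/* Editor's note (assumption): when min < 0 < max the clip is done on the raw
 * IEEE-754 bit patterns: reinterpreted as unsigned integers, negative floats
 * sort in reverse order, so "a > mini" (mini being the bits of the negative
 * min) detects values below min, while flipping the sign bit with a^(1<<31)
 * makes positive values comparable against maxisign to detect values above
 * max; everything else passes through unchanged. */
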
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

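/* Editor's note (observation): W1..W7 are round(2048*sqrt(2)*cos(k*pi/16)),
 * i.e. the standard DCT rotation constants at 11 fractional bits; for k=4,
 * 2048*sqrt(2)*cos(pi/4) = 2048 exactly, which is why W0 == W4 == 2048. */
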
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
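/* Editor's note (observation): the row pass keeps its results at 8 fractional
 * bits ((... + (1<<7)) >> 8), while the column pass pre-rounds each product
 * by >>3 and normalizes with (... + (1<<13)) >> 14; presumably this exact
 * fixed-point behaviour is what WMV2 bitstreams require, hence the dedicated
 * IDCT next to the generic simple_idct. */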
/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

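/* Editor's illustration (observation): after this init, cm = ff_cropTbl +
 * MAX_NEG_CROP acts as a branch-free clamp, i.e. cm[x] == av_clip_uint8(x)
 * for any x in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP], which is why the filters
 * above index it directly with unclipped intermediate results. */
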
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

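    /* Editor's note (assumption): FF_DCT_FASTINT selects the fast "ifast"
     * integer FDCT, FF_DCT_FAAN the floating-point FAAN FDCT, and the default
     * "islow" variant trades speed for accuracy; the fdct248 pointers are the
     * corresponding 2-4-8 field DCTs used for interlaced macroblocks. */
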
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

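    /* Editor's note (assumption): for lowres decoding the 8x8 IDCT is
     * replaced by a 4-point (lowres==1), 2-point (lowres==2) or DC-only
     * (lowres==3) transform that directly produces a correspondingly
     * downscaled block, so no separate scaling pass is needed afterwards. */
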
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

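    /* Editor's note (observation): in these tables the first index selects
     * the block size (0: 16x16, 1: 8x8, 2: 4x4, 3: 2x2, per the TODO comment
     * above) and the second the half-pel position (0: integer, 1: x half,
     * 2: y half, 3: x+y half); the qpel tables below use the analogous
     * 4-bit mcXY index for the sixteen quarter-pel positions. */
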
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4189
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
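
    /* Thirdpel ("tpel") motion compensation, used by the SVQ3 decoder.
     * The tables are indexed as [x + 4*y] with x,y in 0..2, which is why
     * slots 3 and 7 are left unset; mcXY means an offset of X/3 pel
     * horizontally and Y/3 pel vertically. */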

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
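
    /* For the quarter-pel tables the index is again x + 4*y, with x,y in
     * 0..3, and mcXY names the sub-pel position.  For example,
     * dspfunc(put_qpel, 0, 16) sets
     *     c->put_qpel_pixels_tab[0][ 5] = put_qpel16_mc11_c;
     * the (1/4, 1/4) interpolation for 16-pixel wide blocks. */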

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
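
    /* H.264 chroma MC is a 1/8-pel bilinear filter; the table index
     * selects the block width (0 = 8, 1 = 4, 2 = 2) and each function
     * takes the fractional position (0..7 in each direction) as its
     * x/y arguments. */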

    c->draw_edges = draw_edges_c;
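
    /* draw_edges() replicates the picture's border pixels outwards so
     * that motion vectors may safely point slightly outside the frame. */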

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
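
    /* RV40 shares H.264-style qpel code for most positions, but the
     * (3/4, 3/4) case uses a different filter, so the mc33 slots are
     * overridden here after ff_rv40dsp_init(). */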

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
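
    /* "mspel" is the 8x8 sub-pel motion compensation used by the WMV2
     * decoder; slot 0, the full-pel case, is plain ff_put_pixels8x8_c. */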

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
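
    /* SET_CMP_FUNC(hadamard8_diff), for example, expands to
     *     c->hadamard8_diff[0]= hadamard8_diff16_c;
     *     c->hadamard8_diff[1]= hadamard8_diff8x8_c;
     * i.e. slot 0 is the 16-pixel wide variant and slot 1 the 8x8 one. */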

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
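
    /* The SAD comparison functions are simply the full-pel pix_abs
     * routines registered above. */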