Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ d375c104

History | View | Annotate | Download (154 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44
uint32_t ff_squareTbl[512] = {0, };
45

    
46
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
47
#define pb_7f (~0UL/255 * 0x7f)
48
#define pb_80 (~0UL/255 * 0x80)
49

    
50
const uint8_t ff_zigzag_direct[64] = {
51
    0,   1,  8, 16,  9,  2,  3, 10,
52
    17, 24, 32, 25, 18, 11,  4,  5,
53
    12, 19, 26, 33, 40, 48, 41, 34,
54
    27, 20, 13,  6,  7, 14, 21, 28,
55
    35, 42, 49, 56, 57, 50, 43, 36,
56
    29, 22, 15, 23, 30, 37, 44, 51,
57
    58, 59, 52, 45, 38, 31, 39, 46,
58
    53, 60, 61, 54, 47, 55, 62, 63
59
};
60

    
61
/* Specific zigzag scan for 248 idct. NOTE that unlike the
62
   specification, we interleave the fields */
63
const uint8_t ff_zigzag248_direct[64] = {
64
     0,  8,  1,  9, 16, 24,  2, 10,
65
    17, 25, 32, 40, 48, 56, 33, 41,
66
    18, 26,  3, 11,  4, 12, 19, 27,
67
    34, 42, 49, 57, 50, 58, 35, 43,
68
    20, 28,  5, 13,  6, 14, 21, 29,
69
    36, 44, 51, 59, 52, 60, 37, 45,
70
    22, 30,  7, 15, 23, 31, 38, 46,
71
    53, 61, 54, 62, 39, 47, 55, 63,
72
};
73

    
74
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
75
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
76

    
77
const uint8_t ff_alternate_horizontal_scan[64] = {
78
    0,  1,   2,  3,  8,  9, 16, 17,
79
    10, 11,  4,  5,  6,  7, 15, 14,
80
    13, 12, 19, 18, 24, 25, 32, 33,
81
    26, 27, 20, 21, 22, 23, 28, 29,
82
    30, 31, 34, 35, 40, 41, 48, 49,
83
    42, 43, 36, 37, 38, 39, 44, 45,
84
    46, 47, 50, 51, 56, 57, 58, 59,
85
    52, 53, 54, 55, 60, 61, 62, 63,
86
};
87

    
88
const uint8_t ff_alternate_vertical_scan[64] = {
89
    0,  8,  16, 24,  1,  9,  2, 10,
90
    17, 25, 32, 40, 48, 56, 57, 49,
91
    41, 33, 26, 18,  3, 11,  4, 12,
92
    19, 27, 34, 42, 50, 58, 35, 43,
93
    51, 59, 20, 28,  5, 13,  6, 14,
94
    21, 29, 36, 44, 52, 60, 37, 45,
95
    53, 61, 22, 30,  7, 15, 23, 31,
96
    38, 46, 54, 62, 39, 47, 55, 63,
97
};
98

    
99
/* Input permutation for the simple_idct_mmx */
100
static const uint8_t simple_mmx_permutation[64]={
101
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
109
};
110

    
111
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
112

    
113
/**
 * Initialize a ScanTable from a scan order and a caller-supplied permutation.
 *
 * @param permutation   64-entry table mapping natural coefficient positions to
 *                      the layout the (CPU-specific) IDCT expects
 * @param st            scantable to fill in
 * @param src_scantable the unpermuted scan order (e.g. ff_zigzag_direct)
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        /* PPC AltiVec code additionally needs the inverse mapping */
        st->inverse[j] = i;
#endif
    }

    /* raster_end[i] = highest permuted index seen among the first i+1 scan
     * positions (a running maximum), used to bound partial-block processing */
    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
136

    
137
/**
 * Sum all 256 sample values of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size byte stride between rows
 * @return sum of the 16x16 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
158

    
159
static int pix_norm1_c(uint8_t * pix, int line_size)
160
{
161
    int s, i, j;
162
    uint32_t *sq = ff_squareTbl + 256;
163

    
164
    s = 0;
165
    for (i = 0; i < 16; i++) {
166
        for (j = 0; j < 16; j += 8) {
167
#if 0
168
            s += sq[pix[0]];
169
            s += sq[pix[1]];
170
            s += sq[pix[2]];
171
            s += sq[pix[3]];
172
            s += sq[pix[4]];
173
            s += sq[pix[5]];
174
            s += sq[pix[6]];
175
            s += sq[pix[7]];
176
#else
177
#if LONG_MAX > 2147483647
178
            register uint64_t x=*(uint64_t*)pix;
179
            s += sq[x&0xff];
180
            s += sq[(x>>8)&0xff];
181
            s += sq[(x>>16)&0xff];
182
            s += sq[(x>>24)&0xff];
183
            s += sq[(x>>32)&0xff];
184
            s += sq[(x>>40)&0xff];
185
            s += sq[(x>>48)&0xff];
186
            s += sq[(x>>56)&0xff];
187
#else
188
            register uint32_t x=*(uint32_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            x=*(uint32_t*)(pix+4);
194
            s += sq[x&0xff];
195
            s += sq[(x>>8)&0xff];
196
            s += sq[(x>>16)&0xff];
197
            s += sq[(x>>24)&0xff];
198
#endif
199
#endif
200
            pix += 8;
201
        }
202
        pix += line_size - 16;
203
    }
204
    return s;
205
}
206

    
207
/**
 * Byte-swap a buffer of 32-bit words; dst may equal src.
 * Unrolled by eight for speed, with a scalar tail loop.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        dst[i]     = av_bswap32(src[i]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    /* remaining 0..7 words */
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
224

    
225
/** Byte-swap a buffer of 16-bit words; dst may equal src. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
230

    
231
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232
{
233
    int s, i;
234
    uint32_t *sq = ff_squareTbl + 256;
235

    
236
    s = 0;
237
    for (i = 0; i < h; i++) {
238
        s += sq[pix1[0] - pix2[0]];
239
        s += sq[pix1[1] - pix2[1]];
240
        s += sq[pix1[2] - pix2[2]];
241
        s += sq[pix1[3] - pix2[3]];
242
        pix1 += line_size;
243
        pix2 += line_size;
244
    }
245
    return s;
246
}
247

    
248
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249
{
250
    int s, i;
251
    uint32_t *sq = ff_squareTbl + 256;
252

    
253
    s = 0;
254
    for (i = 0; i < h; i++) {
255
        s += sq[pix1[0] - pix2[0]];
256
        s += sq[pix1[1] - pix2[1]];
257
        s += sq[pix1[2] - pix2[2]];
258
        s += sq[pix1[3] - pix2[3]];
259
        s += sq[pix1[4] - pix2[4]];
260
        s += sq[pix1[5] - pix2[5]];
261
        s += sq[pix1[6] - pix2[6]];
262
        s += sq[pix1[7] - pix2[7]];
263
        pix1 += line_size;
264
        pix2 += line_size;
265
    }
266
    return s;
267
}
268

    
269
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270
{
271
    int s, i;
272
    uint32_t *sq = ff_squareTbl + 256;
273

    
274
    s = 0;
275
    for (i = 0; i < h; i++) {
276
        s += sq[pix1[ 0] - pix2[ 0]];
277
        s += sq[pix1[ 1] - pix2[ 1]];
278
        s += sq[pix1[ 2] - pix2[ 2]];
279
        s += sq[pix1[ 3] - pix2[ 3]];
280
        s += sq[pix1[ 4] - pix2[ 4]];
281
        s += sq[pix1[ 5] - pix2[ 5]];
282
        s += sq[pix1[ 6] - pix2[ 6]];
283
        s += sq[pix1[ 7] - pix2[ 7]];
284
        s += sq[pix1[ 8] - pix2[ 8]];
285
        s += sq[pix1[ 9] - pix2[ 9]];
286
        s += sq[pix1[10] - pix2[10]];
287
        s += sq[pix1[11] - pix2[11]];
288
        s += sq[pix1[12] - pix2[12]];
289
        s += sq[pix1[13] - pix2[13]];
290
        s += sq[pix1[14] - pix2[14]];
291
        s += sq[pix1[15] - pix2[15]];
292

    
293
        pix1 += line_size;
294
        pix2 += line_size;
295
    }
296
    return s;
297
}
298

    
299
/* draw the edges of width 'w' of an image of size width, height */
300
//FIXME check that this is ok for mpeg4 interlaced
301
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    /* replicate the first/last row upward/downward w times */
    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        if (sides&EDGE_TOP)    memcpy(buf - (i + 1) * wrap, buf, width);
        if (sides&EDGE_BOTTOM) memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right: replicate the edge sample of every row sideways */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners: fill with the nearest corner sample of the image */
    for(i=0;i<w;i++) {
        if (sides&EDGE_TOP) {
            memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
            memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        }

        if (sides&EDGE_BOTTOM) {
            memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
            memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
        }
    }
}
332

    
333
/**
334
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
335
 * @param buf destination buffer
336
 * @param src source buffer
337
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
338
 * @param block_w width of block
339
 * @param block_h height of block
340
 * @param src_x x coordinate of the top left sample of the block in the source buffer
341
 * @param src_y y coordinate of the top left sample of the block in the source buffer
342
 * @param w width of the source buffer
343
 * @param h height of the source buffer
344
 */
345
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp src_y so that at least one source row overlaps the block;
     * src is shifted by the same amount so src + src_y*linesize is unchanged. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    /* Same clamping horizontally. */
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* [start, end) = part of the block covered by real source samples */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    /* the clamping above guarantees a non-empty overlap */
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    /* NOTE: w is reused from here on as the copy width, not the image width */
    w    = end_x - start_x;
    src += start_y*linesize + start_x;
    buf += start_x;

    //top: replicate the first available source row
    for(y=0; y<start_y; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    // copy existing part
    for(; y<end_y; y++){
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    //bottom: replicate the last copied source row (src overshot by one line)
    src -= linesize;
    for(; y<block_h; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    /* second pass: rewind to the top-left of the block and replicate the
     * left/right edge samples of every row sideways */
    buf -= block_h * linesize + start_x;
    while (block_h--){
       //left
        for(x=0; x<start_x; x++){
            buf[x] = buf[start_x];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x] = buf[end_x - 1];
        }
        buf += linesize;
    }
}
410

    
411
/** Copy an 8x8 block of samples into a DCT coefficient block (stride 8). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
429

    
430
/** Store the per-sample difference s1 - s2 of two 8x8 blocks (stride 8). */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
449

    
450

    
451
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
452
                             int line_size)
453
{
454
    int i;
455
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
456

    
457
    /* read the pixels */
458
    for(i=0;i<8;i++) {
459
        pixels[0] = cm[block[0]];
460
        pixels[1] = cm[block[1]];
461
        pixels[2] = cm[block[2]];
462
        pixels[3] = cm[block[3]];
463
        pixels[4] = cm[block[4]];
464
        pixels[5] = cm[block[5]];
465
        pixels[6] = cm[block[6]];
466
        pixels[7] = cm[block[7]];
467

    
468
        pixels += line_size;
469
        block += 8;
470
    }
471
}
472

    
473
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
474
                                 int line_size)
475
{
476
    int i;
477
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
478

    
479
    /* read the pixels */
480
    for(i=0;i<4;i++) {
481
        pixels[0] = cm[block[0]];
482
        pixels[1] = cm[block[1]];
483
        pixels[2] = cm[block[2]];
484
        pixels[3] = cm[block[3]];
485

    
486
        pixels += line_size;
487
        block += 8;
488
    }
489
}
490

    
491
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
492
                                 int line_size)
493
{
494
    int i;
495
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
496

    
497
    /* read the pixels */
498
    for(i=0;i<2;i++) {
499
        pixels[0] = cm[block[0]];
500
        pixels[1] = cm[block[1]];
501

    
502
        pixels += line_size;
503
        block += 8;
504
    }
505
}
506

    
507
/**
 * Store an 8x8 block of signed coefficients as samples: each value is
 * offset by +128 and clamped to 0..255.
 */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            int v = block[col] + 128;   /* <0 iff coeff < -128, >255 iff coeff > 127 */
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[col] = (uint8_t)v;
        }
        block  += 8;
        pixels += line_size;
    }
}
527

    
528
/**
 * Store an 8x8 coefficient block as samples without clamping (values are
 * truncated by the implicit conversion to uint8_t).
 */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
548

    
549
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
550
                             int line_size)
551
{
552
    int i;
553
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
554

    
555
    /* read the pixels */
556
    for(i=0;i<8;i++) {
557
        pixels[0] = cm[pixels[0] + block[0]];
558
        pixels[1] = cm[pixels[1] + block[1]];
559
        pixels[2] = cm[pixels[2] + block[2]];
560
        pixels[3] = cm[pixels[3] + block[3]];
561
        pixels[4] = cm[pixels[4] + block[4]];
562
        pixels[5] = cm[pixels[5] + block[5]];
563
        pixels[6] = cm[pixels[6] + block[6]];
564
        pixels[7] = cm[pixels[7] + block[7]];
565
        pixels += line_size;
566
        block += 8;
567
    }
568
}
569

    
570
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
571
                          int line_size)
572
{
573
    int i;
574
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
575

    
576
    /* read the pixels */
577
    for(i=0;i<4;i++) {
578
        pixels[0] = cm[pixels[0] + block[0]];
579
        pixels[1] = cm[pixels[1] + block[1]];
580
        pixels[2] = cm[pixels[2] + block[2]];
581
        pixels[3] = cm[pixels[3] + block[3]];
582
        pixels += line_size;
583
        block += 8;
584
    }
585
}
586

    
587
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
588
                          int line_size)
589
{
590
    int i;
591
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
592

    
593
    /* read the pixels */
594
    for(i=0;i<2;i++) {
595
        pixels[0] = cm[pixels[0] + block[0]];
596
        pixels[1] = cm[pixels[1] + block[1]];
597
        pixels += line_size;
598
        block += 8;
599
    }
600
}
601

    
602
/** Add an 8x8 coefficient block onto samples without clamping. */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
618

    
619
/**
 * Add a 4x4 coefficient block onto samples without clamping.
 * NOTE: unlike the 8-wide variants, the coefficient stride here is 4.
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
631

    
632
/** Sum of the absolute values of all 64 coefficients of a block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int idx;

    for (idx = 63; idx >= 0; idx--)
        total += FFABS(block[idx]);
    return total;
}
639

    
640
/** Fill h rows of a 16-wide block with a constant value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
649

    
650
/** Fill h rows of an 8-wide block with a constant value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
659

    
660
/**
 * Upscale an 8x8 block by 2x in both directions: every source sample is
 * written to a 2x2 square of destination samples.
 *
 * The previous implementation stored through uint16_t* pointers into the
 * uint8_t destination (value * 0x0101 duplicates the byte); that is a
 * strict-aliasing type pun and a misaligned store whenever dst or linesize
 * is odd. Since both bytes of each halfword are equal, plain byte stores
 * produce identical memory contents on any endianness.
 *
 * @param src      8x8 source block, row-major
 * @param dst      top-left of the 16x16 destination area
 * @param linesize byte stride between destination rows
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            uint8_t v = src[i];
            /* duplicate horizontally on two consecutive rows */
            dst[2*i]                = v;
            dst[2*i + 1]            = v;
            dst[linesize + 2*i]     = v;
            dst[linesize + 2*i + 1] = v;
        }
        src += 8;
        dst += 2 * linesize;
    }
}
675

    
676
#if 0
677

678
#define PIXOP2(OPNAME, OP) \
679
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
680
{\
681
    int i;\
682
    for(i=0; i<h; i++){\
683
        OP(*((uint64_t*)block), AV_RN64(pixels));\
684
        pixels+=line_size;\
685
        block +=line_size;\
686
    }\
687
}\
688
\
689
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
690
{\
691
    int i;\
692
    for(i=0; i<h; i++){\
693
        const uint64_t a= AV_RN64(pixels  );\
694
        const uint64_t b= AV_RN64(pixels+1);\
695
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
696
        pixels+=line_size;\
697
        block +=line_size;\
698
    }\
699
}\
700
\
701
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
702
{\
703
    int i;\
704
    for(i=0; i<h; i++){\
705
        const uint64_t a= AV_RN64(pixels  );\
706
        const uint64_t b= AV_RN64(pixels+1);\
707
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
708
        pixels+=line_size;\
709
        block +=line_size;\
710
    }\
711
}\
712
\
713
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
714
{\
715
    int i;\
716
    for(i=0; i<h; i++){\
717
        const uint64_t a= AV_RN64(pixels          );\
718
        const uint64_t b= AV_RN64(pixels+line_size);\
719
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
720
        pixels+=line_size;\
721
        block +=line_size;\
722
    }\
723
}\
724
\
725
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
726
{\
727
    int i;\
728
    for(i=0; i<h; i++){\
729
        const uint64_t a= AV_RN64(pixels          );\
730
        const uint64_t b= AV_RN64(pixels+line_size);\
731
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
732
        pixels+=line_size;\
733
        block +=line_size;\
734
    }\
735
}\
736
\
737
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
738
{\
739
        int i;\
740
        const uint64_t a= AV_RN64(pixels  );\
741
        const uint64_t b= AV_RN64(pixels+1);\
742
        uint64_t l0=  (a&0x0303030303030303ULL)\
743
                    + (b&0x0303030303030303ULL)\
744
                    + 0x0202020202020202ULL;\
745
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
746
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
747
        uint64_t l1,h1;\
748
\
749
        pixels+=line_size;\
750
        for(i=0; i<h; i+=2){\
751
            uint64_t a= AV_RN64(pixels  );\
752
            uint64_t b= AV_RN64(pixels+1);\
753
            l1=  (a&0x0303030303030303ULL)\
754
               + (b&0x0303030303030303ULL);\
755
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
756
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
757
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
758
            pixels+=line_size;\
759
            block +=line_size;\
760
            a= AV_RN64(pixels  );\
761
            b= AV_RN64(pixels+1);\
762
            l0=  (a&0x0303030303030303ULL)\
763
               + (b&0x0303030303030303ULL)\
764
               + 0x0202020202020202ULL;\
765
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
766
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
767
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
768
            pixels+=line_size;\
769
            block +=line_size;\
770
        }\
771
}\
772
\
773
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
774
{\
775
        int i;\
776
        const uint64_t a= AV_RN64(pixels  );\
777
        const uint64_t b= AV_RN64(pixels+1);\
778
        uint64_t l0=  (a&0x0303030303030303ULL)\
779
                    + (b&0x0303030303030303ULL)\
780
                    + 0x0101010101010101ULL;\
781
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
782
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
783
        uint64_t l1,h1;\
784
\
785
        pixels+=line_size;\
786
        for(i=0; i<h; i+=2){\
787
            uint64_t a= AV_RN64(pixels  );\
788
            uint64_t b= AV_RN64(pixels+1);\
789
            l1=  (a&0x0303030303030303ULL)\
790
               + (b&0x0303030303030303ULL);\
791
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
792
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
793
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
794
            pixels+=line_size;\
795
            block +=line_size;\
796
            a= AV_RN64(pixels  );\
797
            b= AV_RN64(pixels+1);\
798
            l0=  (a&0x0303030303030303ULL)\
799
               + (b&0x0303030303030303ULL)\
800
               + 0x0101010101010101ULL;\
801
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
802
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
803
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
804
            pixels+=line_size;\
805
            block +=line_size;\
806
        }\
807
}\
808
\
809
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
810
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
811
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
812
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
813
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
814
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
815
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
816

817
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
818
#else // 64 bit variant
819

    
820
#define PIXOP2(OPNAME, OP) \
821
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
822
    int i;\
823
    for(i=0; i<h; i++){\
824
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
825
        pixels+=line_size;\
826
        block +=line_size;\
827
    }\
828
}\
829
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830
    int i;\
831
    for(i=0; i<h; i++){\
832
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
833
        pixels+=line_size;\
834
        block +=line_size;\
835
    }\
836
}\
837
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838
    int i;\
839
    for(i=0; i<h; i++){\
840
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
841
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
842
        pixels+=line_size;\
843
        block +=line_size;\
844
    }\
845
}\
846
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
847
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
848
}\
849
\
850
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
851
                                                int src_stride1, int src_stride2, int h){\
852
    int i;\
853
    for(i=0; i<h; i++){\
854
        uint32_t a,b;\
855
        a= AV_RN32(&src1[i*src_stride1  ]);\
856
        b= AV_RN32(&src2[i*src_stride2  ]);\
857
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
858
        a= AV_RN32(&src1[i*src_stride1+4]);\
859
        b= AV_RN32(&src2[i*src_stride2+4]);\
860
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
861
    }\
862
}\
863
\
864
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
865
                                                int src_stride1, int src_stride2, int h){\
866
    int i;\
867
    for(i=0; i<h; i++){\
868
        uint32_t a,b;\
869
        a= AV_RN32(&src1[i*src_stride1  ]);\
870
        b= AV_RN32(&src2[i*src_stride2  ]);\
871
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
872
        a= AV_RN32(&src1[i*src_stride1+4]);\
873
        b= AV_RN32(&src2[i*src_stride2+4]);\
874
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
875
    }\
876
}\
877
\
878
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
879
                                                int src_stride1, int src_stride2, int h){\
880
    int i;\
881
    for(i=0; i<h; i++){\
882
        uint32_t a,b;\
883
        a= AV_RN32(&src1[i*src_stride1  ]);\
884
        b= AV_RN32(&src2[i*src_stride2  ]);\
885
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
886
    }\
887
}\
888
\
889
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890
                                                int src_stride1, int src_stride2, int h){\
891
    int i;\
892
    for(i=0; i<h; i++){\
893
        uint32_t a,b;\
894
        a= AV_RN16(&src1[i*src_stride1  ]);\
895
        b= AV_RN16(&src2[i*src_stride2  ]);\
896
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
897
    }\
898
}\
899
\
900
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
901
                                                int src_stride1, int src_stride2, int h){\
902
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
903
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
904
}\
905
\
906
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
907
                                                int src_stride1, int src_stride2, int h){\
908
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
909
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
910
}\
911
\
912
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
913
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
914
}\
915
\
916
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
917
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
918
}\
919
\
920
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
921
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
922
}\
923
\
924
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
925
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
926
}\
927
\
928
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
929
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
930
    int i;\
931
    for(i=0; i<h; i++){\
932
        uint32_t a, b, c, d, l0, l1, h0, h1;\
933
        a= AV_RN32(&src1[i*src_stride1]);\
934
        b= AV_RN32(&src2[i*src_stride2]);\
935
        c= AV_RN32(&src3[i*src_stride3]);\
936
        d= AV_RN32(&src4[i*src_stride4]);\
937
        l0=  (a&0x03030303UL)\
938
           + (b&0x03030303UL)\
939
           + 0x02020202UL;\
940
        h0= ((a&0xFCFCFCFCUL)>>2)\
941
          + ((b&0xFCFCFCFCUL)>>2);\
942
        l1=  (c&0x03030303UL)\
943
           + (d&0x03030303UL);\
944
        h1= ((c&0xFCFCFCFCUL)>>2)\
945
          + ((d&0xFCFCFCFCUL)>>2);\
946
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
947
        a= AV_RN32(&src1[i*src_stride1+4]);\
948
        b= AV_RN32(&src2[i*src_stride2+4]);\
949
        c= AV_RN32(&src3[i*src_stride3+4]);\
950
        d= AV_RN32(&src4[i*src_stride4+4]);\
951
        l0=  (a&0x03030303UL)\
952
           + (b&0x03030303UL)\
953
           + 0x02020202UL;\
954
        h0= ((a&0xFCFCFCFCUL)>>2)\
955
          + ((b&0xFCFCFCFCUL)>>2);\
956
        l1=  (c&0x03030303UL)\
957
           + (d&0x03030303UL);\
958
        h1= ((c&0xFCFCFCFCUL)>>2)\
959
          + ((d&0xFCFCFCFCUL)>>2);\
960
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
961
    }\
962
}\
963
\
964
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
965
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
966
}\
967
\
968
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
969
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
970
}\
971
\
972
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
973
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
974
}\
975
\
976
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
977
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
978
}\
979
\
980
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
981
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
982
    int i;\
983
    for(i=0; i<h; i++){\
984
        uint32_t a, b, c, d, l0, l1, h0, h1;\
985
        a= AV_RN32(&src1[i*src_stride1]);\
986
        b= AV_RN32(&src2[i*src_stride2]);\
987
        c= AV_RN32(&src3[i*src_stride3]);\
988
        d= AV_RN32(&src4[i*src_stride4]);\
989
        l0=  (a&0x03030303UL)\
990
           + (b&0x03030303UL)\
991
           + 0x01010101UL;\
992
        h0= ((a&0xFCFCFCFCUL)>>2)\
993
          + ((b&0xFCFCFCFCUL)>>2);\
994
        l1=  (c&0x03030303UL)\
995
           + (d&0x03030303UL);\
996
        h1= ((c&0xFCFCFCFCUL)>>2)\
997
          + ((d&0xFCFCFCFCUL)>>2);\
998
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999
        a= AV_RN32(&src1[i*src_stride1+4]);\
1000
        b= AV_RN32(&src2[i*src_stride2+4]);\
1001
        c= AV_RN32(&src3[i*src_stride3+4]);\
1002
        d= AV_RN32(&src4[i*src_stride4+4]);\
1003
        l0=  (a&0x03030303UL)\
1004
           + (b&0x03030303UL)\
1005
           + 0x01010101UL;\
1006
        h0= ((a&0xFCFCFCFCUL)>>2)\
1007
          + ((b&0xFCFCFCFCUL)>>2);\
1008
        l1=  (c&0x03030303UL)\
1009
           + (d&0x03030303UL);\
1010
        h1= ((c&0xFCFCFCFCUL)>>2)\
1011
          + ((d&0xFCFCFCFCUL)>>2);\
1012
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1013
    }\
1014
}\
1015
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1019
}\
1020
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1021
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1022
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1023
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1024
}\
1025
\
1026
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1027
{\
1028
        int i, a0, b0, a1, b1;\
1029
        a0= pixels[0];\
1030
        b0= pixels[1] + 2;\
1031
        a0 += b0;\
1032
        b0 += pixels[2];\
1033
\
1034
        pixels+=line_size;\
1035
        for(i=0; i<h; i+=2){\
1036
            a1= pixels[0];\
1037
            b1= pixels[1];\
1038
            a1 += b1;\
1039
            b1 += pixels[2];\
1040
\
1041
            block[0]= (a1+a0)>>2; /* FIXME non put */\
1042
            block[1]= (b1+b0)>>2;\
1043
\
1044
            pixels+=line_size;\
1045
            block +=line_size;\
1046
\
1047
            a0= pixels[0];\
1048
            b0= pixels[1] + 2;\
1049
            a0 += b0;\
1050
            b0 += pixels[2];\
1051
\
1052
            block[0]= (a1+a0)>>2;\
1053
            block[1]= (b1+b0)>>2;\
1054
            pixels+=line_size;\
1055
            block +=line_size;\
1056
        }\
1057
}\
1058
\
1059
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1060
{\
1061
        int i;\
1062
        const uint32_t a= AV_RN32(pixels  );\
1063
        const uint32_t b= AV_RN32(pixels+1);\
1064
        uint32_t l0=  (a&0x03030303UL)\
1065
                    + (b&0x03030303UL)\
1066
                    + 0x02020202UL;\
1067
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1068
                   + ((b&0xFCFCFCFCUL)>>2);\
1069
        uint32_t l1,h1;\
1070
\
1071
        pixels+=line_size;\
1072
        for(i=0; i<h; i+=2){\
1073
            uint32_t a= AV_RN32(pixels  );\
1074
            uint32_t b= AV_RN32(pixels+1);\
1075
            l1=  (a&0x03030303UL)\
1076
               + (b&0x03030303UL);\
1077
            h1= ((a&0xFCFCFCFCUL)>>2)\
1078
              + ((b&0xFCFCFCFCUL)>>2);\
1079
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080
            pixels+=line_size;\
1081
            block +=line_size;\
1082
            a= AV_RN32(pixels  );\
1083
            b= AV_RN32(pixels+1);\
1084
            l0=  (a&0x03030303UL)\
1085
               + (b&0x03030303UL)\
1086
               + 0x02020202UL;\
1087
            h0= ((a&0xFCFCFCFCUL)>>2)\
1088
              + ((b&0xFCFCFCFCUL)>>2);\
1089
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090
            pixels+=line_size;\
1091
            block +=line_size;\
1092
        }\
1093
}\
1094
\
1095
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1096
{\
1097
    int j;\
1098
    for(j=0; j<2; j++){\
1099
        int i;\
1100
        const uint32_t a= AV_RN32(pixels  );\
1101
        const uint32_t b= AV_RN32(pixels+1);\
1102
        uint32_t l0=  (a&0x03030303UL)\
1103
                    + (b&0x03030303UL)\
1104
                    + 0x02020202UL;\
1105
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1106
                   + ((b&0xFCFCFCFCUL)>>2);\
1107
        uint32_t l1,h1;\
1108
\
1109
        pixels+=line_size;\
1110
        for(i=0; i<h; i+=2){\
1111
            uint32_t a= AV_RN32(pixels  );\
1112
            uint32_t b= AV_RN32(pixels+1);\
1113
            l1=  (a&0x03030303UL)\
1114
               + (b&0x03030303UL);\
1115
            h1= ((a&0xFCFCFCFCUL)>>2)\
1116
              + ((b&0xFCFCFCFCUL)>>2);\
1117
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1118
            pixels+=line_size;\
1119
            block +=line_size;\
1120
            a= AV_RN32(pixels  );\
1121
            b= AV_RN32(pixels+1);\
1122
            l0=  (a&0x03030303UL)\
1123
               + (b&0x03030303UL)\
1124
               + 0x02020202UL;\
1125
            h0= ((a&0xFCFCFCFCUL)>>2)\
1126
              + ((b&0xFCFCFCFCUL)>>2);\
1127
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1128
            pixels+=line_size;\
1129
            block +=line_size;\
1130
        }\
1131
        pixels+=4-line_size*(h+1);\
1132
        block +=4-line_size*h;\
1133
    }\
1134
}\
1135
\
1136
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1137
{\
1138
    int j;\
1139
    for(j=0; j<2; j++){\
1140
        int i;\
1141
        const uint32_t a= AV_RN32(pixels  );\
1142
        const uint32_t b= AV_RN32(pixels+1);\
1143
        uint32_t l0=  (a&0x03030303UL)\
1144
                    + (b&0x03030303UL)\
1145
                    + 0x01010101UL;\
1146
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1147
                   + ((b&0xFCFCFCFCUL)>>2);\
1148
        uint32_t l1,h1;\
1149
\
1150
        pixels+=line_size;\
1151
        for(i=0; i<h; i+=2){\
1152
            uint32_t a= AV_RN32(pixels  );\
1153
            uint32_t b= AV_RN32(pixels+1);\
1154
            l1=  (a&0x03030303UL)\
1155
               + (b&0x03030303UL);\
1156
            h1= ((a&0xFCFCFCFCUL)>>2)\
1157
              + ((b&0xFCFCFCFCUL)>>2);\
1158
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1159
            pixels+=line_size;\
1160
            block +=line_size;\
1161
            a= AV_RN32(pixels  );\
1162
            b= AV_RN32(pixels+1);\
1163
            l0=  (a&0x03030303UL)\
1164
               + (b&0x03030303UL)\
1165
               + 0x01010101UL;\
1166
            h0= ((a&0xFCFCFCFCUL)>>2)\
1167
              + ((b&0xFCFCFCFCUL)>>2);\
1168
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1169
            pixels+=line_size;\
1170
            block +=line_size;\
1171
        }\
1172
        pixels+=4-line_size*(h+1);\
1173
        block +=4-line_size*h;\
1174
    }\
1175
}\
1176
\
1177
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1178
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1179
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1180
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1181
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1182
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1183
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1184
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1185

    
1186
#define op_avg(a, b) a = rnd_avg32(a, b)
1187
#endif
1188
#define op_put(a, b) a = b
1189

    
1190
PIXOP2(avg, op_avg)
1191
PIXOP2(put, op_put)
1192
#undef op_avg
1193
#undef op_put
1194

    
1195
#define put_no_rnd_pixels8_c  put_pixels8_c
1196
#define put_no_rnd_pixels16_c put_pixels16_c
1197

    
1198
#define avg2(a,b) ((a+b+1)>>1)
1199
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1200

    
1201
/* Average two 16-wide sources into dst, truncating rounding (no_rnd). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1204

    
1205
/* Average two 8-wide sources into dst, truncating rounding (no_rnd). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1208

    
1209
/**
 * One-warp-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide, h-line block at 1/16-pel fractional position (x16, y16).
 * The four corner weights A..D sum to 256, hence the >>8 after adding
 * the caller-supplied rounder.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1231

    
1232
/**
 * Global motion compensation with an affine vector field (MPEG-4 GMC).
 * (ox,oy) is the 16.16 fixed-point motion vector at the block origin;
 * (dxx,dyx) are its per-pixel increments and (dxy,dyy) its per-line
 * increments. 'shift' gives the sub-pel precision (s = 1<<shift), 'r'
 * the rounding constant, and samples outside the width x height source
 * are clamped to the nearest edge. Processes an 8-pixel-wide column of
 * h lines into dst.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* convert to inclusive maximum coordinates for the range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare rejects negative coordinates too */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of 4 samples */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* clamped vertically: interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clamped horizontally: interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped both ways: nearest edge sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1289

    
1290
/* Thirdpel copy, integer position: dispatch to the plain copy for the
 * given block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1298

    
1299
/* Thirdpel MC, horizontal 1/3 phase: (2*a + b)/3, approximated with the
 * fixed-point factor 683/2048 and a +1 rounding bias. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1309

    
1310
/* Thirdpel MC, horizontal 2/3 phase: (a + 2*b)/3 via 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1320

    
1321
/* Thirdpel MC, vertical 1/3 phase: (2*a + below)/3 via 683/2048. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1331

    
1332
/* Thirdpel MC, diagonal (1/3,1/3): weighted 2x2 average 4:3:3:2,
 * fixed-point factor 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1342

    
1343
/* Thirdpel MC, diagonal (1/3,2/3): weighted 2x2 average 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1353

    
1354
/* Thirdpel MC, vertical 2/3 phase: (a + 2*below)/3 via 683/2048. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1364

    
1365
/* Thirdpel MC, diagonal (2/3,1/3): weighted 2x2 average 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1375

    
1376
/* Thirdpel MC, diagonal (2/3,2/3): weighted 2x2 average 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
1386

    
1387
/* Thirdpel averaging MC, integer position: dispatch to the plain
 * averaging copy for the given block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1395

    
1396
/* Thirdpel averaging MC, horizontal 1/3 phase: rounding average of the
 * existing dst with the interpolated value. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1406

    
1407
/* Thirdpel averaging MC, horizontal 2/3 phase. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1417

    
1418
/* Thirdpel averaging MC, vertical 1/3 phase. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1428

    
1429
/* Thirdpel averaging MC, diagonal (1/3,1/3). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1439

    
1440
/* Thirdpel averaging MC, diagonal (1/3,2/3). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1450

    
1451
/* Thirdpel averaging MC, vertical 2/3 phase. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1461

    
1462
/* Thirdpel averaging MC, diagonal (2/3,1/3). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1472

    
1473
/* Thirdpel averaging MC, diagonal (2/3,2/3). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
1483
#if 0
/* Disabled width-specialized thirdpel wrappers. NOTE(review): the stray
 * "void" before each forwarded call looks like a leftover typo; the
 * block is compiled out, so it has no effect. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1504

    
1505
/* H.264 chroma motion compensation: 2x2 bilinear interpolation with
 * eighth-pel weights A..D (sum 64). When D==0 the filter degenerates to
 * a 2-tap filter along one axis (step = stride for vertical, 1 for
 * horizontal), or a plain scale when both fractions are zero. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1605

    
1606
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1607
#define op_put(a, b) a = (((b) + 32)>>6)
1608

    
1609
H264_CHROMA_MC(put_       , op_put)
1610
H264_CHROMA_MC(avg_       , op_avg)
1611
#undef op_avg
1612
#undef op_put
1613

    
1614
#define QPEL_MC(r, OPNAME, RND, OP) \
1615
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1617
    int i;\
1618
    for(i=0; i<h; i++)\
1619
    {\
1620
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1621
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1622
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1623
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1624
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1625
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1626
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1627
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1628
        dst+=dstStride;\
1629
        src+=srcStride;\
1630
    }\
1631
}\
1632
\
1633
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1634
    const int w=8;\
1635
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1636
    int i;\
1637
    for(i=0; i<w; i++)\
1638
    {\
1639
        const int src0= src[0*srcStride];\
1640
        const int src1= src[1*srcStride];\
1641
        const int src2= src[2*srcStride];\
1642
        const int src3= src[3*srcStride];\
1643
        const int src4= src[4*srcStride];\
1644
        const int src5= src[5*srcStride];\
1645
        const int src6= src[6*srcStride];\
1646
        const int src7= src[7*srcStride];\
1647
        const int src8= src[8*srcStride];\
1648
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1649
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1650
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1651
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1652
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1653
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1654
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1655
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1656
        dst++;\
1657
        src++;\
1658
    }\
1659
}\
1660
\
1661
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1662
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1663
    int i;\
1664
    \
1665
    for(i=0; i<h; i++)\
1666
    {\
1667
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1668
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1669
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1670
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1671
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1672
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1673
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1674
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1675
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1676
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1677
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1678
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1679
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1680
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1681
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1682
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1683
        dst+=dstStride;\
1684
        src+=srcStride;\
1685
    }\
1686
}\
1687
\
1688
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1689
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1690
    int i;\
1691
    const int w=16;\
1692
    for(i=0; i<w; i++)\
1693
    {\
1694
        const int src0= src[0*srcStride];\
1695
        const int src1= src[1*srcStride];\
1696
        const int src2= src[2*srcStride];\
1697
        const int src3= src[3*srcStride];\
1698
        const int src4= src[4*srcStride];\
1699
        const int src5= src[5*srcStride];\
1700
        const int src6= src[6*srcStride];\
1701
        const int src7= src[7*srcStride];\
1702
        const int src8= src[8*srcStride];\
1703
        const int src9= src[9*srcStride];\
1704
        const int src10= src[10*srcStride];\
1705
        const int src11= src[11*srcStride];\
1706
        const int src12= src[12*srcStride];\
1707
        const int src13= src[13*srcStride];\
1708
        const int src14= src[14*srcStride];\
1709
        const int src15= src[15*srcStride];\
1710
        const int src16= src[16*srcStride];\
1711
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1712
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1713
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1714
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1715
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1716
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1717
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1718
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1719
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1720
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1721
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1722
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1723
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1724
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1725
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1726
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1727
        dst++;\
1728
        src++;\
1729
    }\
1730
}\
1731
\
1732
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1733
    uint8_t half[64];\
1734
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1735
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1736
}\
1737
\
1738
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1739
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1740
}\
1741
\
1742
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1743
    uint8_t half[64];\
1744
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1745
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1746
}\
1747
\
1748
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1749
    uint8_t full[16*9];\
1750
    uint8_t half[64];\
1751
    copy_block9(full, src, 16, stride, 9);\
1752
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1753
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1754
}\
1755
\
1756
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1757
    uint8_t full[16*9];\
1758
    copy_block9(full, src, 16, stride, 9);\
1759
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1760
}\
1761
\
1762
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1763
    uint8_t full[16*9];\
1764
    uint8_t half[64];\
1765
    copy_block9(full, src, 16, stride, 9);\
1766
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1767
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1768
}\
1769
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770
    uint8_t full[16*9];\
1771
    uint8_t halfH[72];\
1772
    uint8_t halfV[64];\
1773
    uint8_t halfHV[64];\
1774
    copy_block9(full, src, 16, stride, 9);\
1775
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779
}\
1780
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1781
    uint8_t full[16*9];\
1782
    uint8_t halfH[72];\
1783
    uint8_t halfHV[64];\
1784
    copy_block9(full, src, 16, stride, 9);\
1785
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1789
}\
1790
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791
    uint8_t full[16*9];\
1792
    uint8_t halfH[72];\
1793
    uint8_t halfV[64];\
1794
    uint8_t halfHV[64];\
1795
    copy_block9(full, src, 16, stride, 9);\
1796
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1797
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800
}\
1801
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1802
    uint8_t full[16*9];\
1803
    uint8_t halfH[72];\
1804
    uint8_t halfHV[64];\
1805
    copy_block9(full, src, 16, stride, 9);\
1806
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1810
}\
1811
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1812
    uint8_t full[16*9];\
1813
    uint8_t halfH[72];\
1814
    uint8_t halfV[64];\
1815
    uint8_t halfHV[64];\
1816
    copy_block9(full, src, 16, stride, 9);\
1817
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1819
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1821
}\
1822
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1823
    uint8_t full[16*9];\
1824
    uint8_t halfH[72];\
1825
    uint8_t halfHV[64];\
1826
    copy_block9(full, src, 16, stride, 9);\
1827
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1829
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1831
}\
1832
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833
    uint8_t full[16*9];\
1834
    uint8_t halfH[72];\
1835
    uint8_t halfV[64];\
1836
    uint8_t halfHV[64];\
1837
    copy_block9(full, src, 16, stride, 9);\
1838
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1839
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1842
}\
1843
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1844
    uint8_t full[16*9];\
1845
    uint8_t halfH[72];\
1846
    uint8_t halfHV[64];\
1847
    copy_block9(full, src, 16, stride, 9);\
1848
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1850
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1852
}\
1853
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1854
    uint8_t halfH[72];\
1855
    uint8_t halfHV[64];\
1856
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1857
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1859
}\
1860
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1861
    uint8_t halfH[72];\
1862
    uint8_t halfHV[64];\
1863
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1864
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1866
}\
1867
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1868
    uint8_t full[16*9];\
1869
    uint8_t halfH[72];\
1870
    uint8_t halfV[64];\
1871
    uint8_t halfHV[64];\
1872
    copy_block9(full, src, 16, stride, 9);\
1873
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1874
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1875
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1876
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1877
}\
1878
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1879
    uint8_t full[16*9];\
1880
    uint8_t halfH[72];\
1881
    copy_block9(full, src, 16, stride, 9);\
1882
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1883
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1884
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1885
}\
1886
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887
    uint8_t full[16*9];\
1888
    uint8_t halfH[72];\
1889
    uint8_t halfV[64];\
1890
    uint8_t halfHV[64];\
1891
    copy_block9(full, src, 16, stride, 9);\
1892
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1896
}\
1897
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1898
    uint8_t full[16*9];\
1899
    uint8_t halfH[72];\
1900
    copy_block9(full, src, 16, stride, 9);\
1901
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1904
}\
1905
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1906
    uint8_t halfH[72];\
1907
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1908
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1909
}\
1910
\
1911
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1912
    uint8_t half[256];\
1913
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1914
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1915
}\
1916
\
1917
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1918
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1919
}\
1920
\
1921
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1922
    uint8_t half[256];\
1923
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1924
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1925
}\
1926
\
1927
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1928
    uint8_t full[24*17];\
1929
    uint8_t half[256];\
1930
    copy_block17(full, src, 24, stride, 17);\
1931
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1932
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1933
}\
1934
\
1935
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1936
    uint8_t full[24*17];\
1937
    copy_block17(full, src, 24, stride, 17);\
1938
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1939
}\
1940
\
1941
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1942
    uint8_t full[24*17];\
1943
    uint8_t half[256];\
1944
    copy_block17(full, src, 24, stride, 17);\
1945
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1946
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1947
}\
1948
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1949
    uint8_t full[24*17];\
1950
    uint8_t halfH[272];\
1951
    uint8_t halfV[256];\
1952
    uint8_t halfHV[256];\
1953
    copy_block17(full, src, 24, stride, 17);\
1954
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1955
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1956
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1957
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1958
}\
1959
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1960
    uint8_t full[24*17];\
1961
    uint8_t halfH[272];\
1962
    uint8_t halfHV[256];\
1963
    copy_block17(full, src, 24, stride, 17);\
1964
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1966
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1968
}\
1969
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1970
    uint8_t full[24*17];\
1971
    uint8_t halfH[272];\
1972
    uint8_t halfV[256];\
1973
    uint8_t halfHV[256];\
1974
    copy_block17(full, src, 24, stride, 17);\
1975
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1976
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1977
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1978
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1979
}\
1980
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1981
    uint8_t full[24*17];\
1982
    uint8_t halfH[272];\
1983
    uint8_t halfHV[256];\
1984
    copy_block17(full, src, 24, stride, 17);\
1985
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1987
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1989
}\
1990
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1991
    uint8_t full[24*17];\
1992
    uint8_t halfH[272];\
1993
    uint8_t halfV[256];\
1994
    uint8_t halfHV[256];\
1995
    copy_block17(full, src, 24, stride, 17);\
1996
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1997
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1998
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2000
}\
2001
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2002
    uint8_t full[24*17];\
2003
    uint8_t halfH[272];\
2004
    uint8_t halfHV[256];\
2005
    copy_block17(full, src, 24, stride, 17);\
2006
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2007
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2008
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2010
}\
2011
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2012
    uint8_t full[24*17];\
2013
    uint8_t halfH[272];\
2014
    uint8_t halfV[256];\
2015
    uint8_t halfHV[256];\
2016
    copy_block17(full, src, 24, stride, 17);\
2017
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2018
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2019
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2021
}\
2022
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2023
    uint8_t full[24*17];\
2024
    uint8_t halfH[272];\
2025
    uint8_t halfHV[256];\
2026
    copy_block17(full, src, 24, stride, 17);\
2027
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2029
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2031
}\
2032
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2033
    uint8_t halfH[272];\
2034
    uint8_t halfHV[256];\
2035
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2037
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2038
}\
2039
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2040
    uint8_t halfH[272];\
2041
    uint8_t halfHV[256];\
2042
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2043
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2045
}\
2046
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047
    uint8_t full[24*17];\
2048
    uint8_t halfH[272];\
2049
    uint8_t halfV[256];\
2050
    uint8_t halfHV[256];\
2051
    copy_block17(full, src, 24, stride, 17);\
2052
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2056
}\
2057
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2058
    uint8_t full[24*17];\
2059
    uint8_t halfH[272];\
2060
    copy_block17(full, src, 24, stride, 17);\
2061
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2062
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2063
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2064
}\
2065
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2066
    uint8_t full[24*17];\
2067
    uint8_t halfH[272];\
2068
    uint8_t halfV[256];\
2069
    uint8_t halfHV[256];\
2070
    copy_block17(full, src, 24, stride, 17);\
2071
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2072
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2073
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2074
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2075
}\
2076
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2077
    uint8_t full[24*17];\
2078
    uint8_t halfH[272];\
2079
    copy_block17(full, src, 24, stride, 17);\
2080
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2082
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2083
}\
2084
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2085
    uint8_t halfH[272];\
2086
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2087
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2088
}
2089

    
2090
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2091
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2092
#define op_put(a, b) a = cm[((b) + 16)>>5]
2093
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2094

    
2095
QPEL_MC(0, put_       , _       , op_put)
2096
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2097
QPEL_MC(0, avg_       , _       , op_avg)
2098
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2099
#undef op_avg
2100
#undef op_avg_no_rnd
2101
#undef op_put
2102
#undef op_put_no_rnd
2103

    
2104
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
2105
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2106
#define put_qpel16_mc00_c ff_put_pixels16x16_c
2107
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2108
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2109
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2110

    
2111
#if 1
2112
#define H264_LOWPASS(OPNAME, OP, OP2) \
2113
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2114
    const int h=2;\
2115
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2116
    int i;\
2117
    for(i=0; i<h; i++)\
2118
    {\
2119
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2120
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2121
        dst+=dstStride;\
2122
        src+=srcStride;\
2123
    }\
2124
}\
2125
\
2126
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2127
    const int w=2;\
2128
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2129
    int i;\
2130
    for(i=0; i<w; i++)\
2131
    {\
2132
        const int srcB= src[-2*srcStride];\
2133
        const int srcA= src[-1*srcStride];\
2134
        const int src0= src[0 *srcStride];\
2135
        const int src1= src[1 *srcStride];\
2136
        const int src2= src[2 *srcStride];\
2137
        const int src3= src[3 *srcStride];\
2138
        const int src4= src[4 *srcStride];\
2139
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2140
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2141
        dst++;\
2142
        src++;\
2143
    }\
2144
}\
2145
\
2146
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2147
    const int h=2;\
2148
    const int w=2;\
2149
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2150
    int i;\
2151
    src -= 2*srcStride;\
2152
    for(i=0; i<h+5; i++)\
2153
    {\
2154
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2155
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2156
        tmp+=tmpStride;\
2157
        src+=srcStride;\
2158
    }\
2159
    tmp -= tmpStride*(h+5-2);\
2160
    for(i=0; i<w; i++)\
2161
    {\
2162
        const int tmpB= tmp[-2*tmpStride];\
2163
        const int tmpA= tmp[-1*tmpStride];\
2164
        const int tmp0= tmp[0 *tmpStride];\
2165
        const int tmp1= tmp[1 *tmpStride];\
2166
        const int tmp2= tmp[2 *tmpStride];\
2167
        const int tmp3= tmp[3 *tmpStride];\
2168
        const int tmp4= tmp[4 *tmpStride];\
2169
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2170
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2171
        dst++;\
2172
        tmp++;\
2173
    }\
2174
}\
2175
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2176
    const int h=4;\
2177
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2178
    int i;\
2179
    for(i=0; i<h; i++)\
2180
    {\
2181
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2182
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2183
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2184
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2185
        dst+=dstStride;\
2186
        src+=srcStride;\
2187
    }\
2188
}\
2189
\
2190
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2191
    const int w=4;\
2192
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2193
    int i;\
2194
    for(i=0; i<w; i++)\
2195
    {\
2196
        const int srcB= src[-2*srcStride];\
2197
        const int srcA= src[-1*srcStride];\
2198
        const int src0= src[0 *srcStride];\
2199
        const int src1= src[1 *srcStride];\
2200
        const int src2= src[2 *srcStride];\
2201
        const int src3= src[3 *srcStride];\
2202
        const int src4= src[4 *srcStride];\
2203
        const int src5= src[5 *srcStride];\
2204
        const int src6= src[6 *srcStride];\
2205
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2208
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2209
        dst++;\
2210
        src++;\
2211
    }\
2212
}\
2213
\
2214
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2215
    const int h=4;\
2216
    const int w=4;\
2217
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2218
    int i;\
2219
    src -= 2*srcStride;\
2220
    for(i=0; i<h+5; i++)\
2221
    {\
2222
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2223
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2224
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2225
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2226
        tmp+=tmpStride;\
2227
        src+=srcStride;\
2228
    }\
2229
    tmp -= tmpStride*(h+5-2);\
2230
    for(i=0; i<w; i++)\
2231
    {\
2232
        const int tmpB= tmp[-2*tmpStride];\
2233
        const int tmpA= tmp[-1*tmpStride];\
2234
        const int tmp0= tmp[0 *tmpStride];\
2235
        const int tmp1= tmp[1 *tmpStride];\
2236
        const int tmp2= tmp[2 *tmpStride];\
2237
        const int tmp3= tmp[3 *tmpStride];\
2238
        const int tmp4= tmp[4 *tmpStride];\
2239
        const int tmp5= tmp[5 *tmpStride];\
2240
        const int tmp6= tmp[6 *tmpStride];\
2241
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2242
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2243
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2244
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2245
        dst++;\
2246
        tmp++;\
2247
    }\
2248
}\
2249
\
2250
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2251
    const int h=8;\
2252
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2253
    int i;\
2254
    for(i=0; i<h; i++)\
2255
    {\
2256
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2257
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2258
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2259
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2260
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2261
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2262
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2263
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2264
        dst+=dstStride;\
2265
        src+=srcStride;\
2266
    }\
2267
}\
2268
\
2269
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270
    const int w=8;\
2271
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2272
    int i;\
2273
    for(i=0; i<w; i++)\
2274
    {\
2275
        const int srcB= src[-2*srcStride];\
2276
        const int srcA= src[-1*srcStride];\
2277
        const int src0= src[0 *srcStride];\
2278
        const int src1= src[1 *srcStride];\
2279
        const int src2= src[2 *srcStride];\
2280
        const int src3= src[3 *srcStride];\
2281
        const int src4= src[4 *srcStride];\
2282
        const int src5= src[5 *srcStride];\
2283
        const int src6= src[6 *srcStride];\
2284
        const int src7= src[7 *srcStride];\
2285
        const int src8= src[8 *srcStride];\
2286
        const int src9= src[9 *srcStride];\
2287
        const int src10=src[10*srcStride];\
2288
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2289
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2290
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2291
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2292
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2293
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2294
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2295
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2296
        dst++;\
2297
        src++;\
2298
    }\
2299
}\
2300
\
2301
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2302
    const int h=8;\
2303
    const int w=8;\
2304
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2305
    int i;\
2306
    src -= 2*srcStride;\
2307
    for(i=0; i<h+5; i++)\
2308
    {\
2309
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2310
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2311
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2312
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2313
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2314
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2315
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2316
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2317
        tmp+=tmpStride;\
2318
        src+=srcStride;\
2319
    }\
2320
    tmp -= tmpStride*(h+5-2);\
2321
    for(i=0; i<w; i++)\
2322
    {\
2323
        const int tmpB= tmp[-2*tmpStride];\
2324
        const int tmpA= tmp[-1*tmpStride];\
2325
        const int tmp0= tmp[0 *tmpStride];\
2326
        const int tmp1= tmp[1 *tmpStride];\
2327
        const int tmp2= tmp[2 *tmpStride];\
2328
        const int tmp3= tmp[3 *tmpStride];\
2329
        const int tmp4= tmp[4 *tmpStride];\
2330
        const int tmp5= tmp[5 *tmpStride];\
2331
        const int tmp6= tmp[6 *tmpStride];\
2332
        const int tmp7= tmp[7 *tmpStride];\
2333
        const int tmp8= tmp[8 *tmpStride];\
2334
        const int tmp9= tmp[9 *tmpStride];\
2335
        const int tmp10=tmp[10*tmpStride];\
2336
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2337
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2338
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2339
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2340
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2341
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2342
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2343
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2344
        dst++;\
2345
        tmp++;\
2346
    }\
2347
}\
2348
\
2349
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2350
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2351
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2352
    src += 8*srcStride;\
2353
    dst += 8*dstStride;\
2354
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2355
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2356
}\
2357
\
2358
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2359
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2360
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2361
    src += 8*srcStride;\
2362
    dst += 8*dstStride;\
2363
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2364
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2365
}\
2366
\
2367
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2368
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2369
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2370
    src += 8*srcStride;\
2371
    dst += 8*dstStride;\
2372
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2373
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2374
}\
2375

    
2376
/**
 * Generate the H.264 quarter-pel motion-compensation functions for one
 * block size: one function per sub-pel position _mcXY (X,Y in 0..3),
 * each built from the h/v/hv lowpass helpers and pixel copy/average
 * primitives. OPNAME selects put_ or avg_ output.
 * (Was garbled by interleaved revision-line numbers; restored.)
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2514
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2515
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2516
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2517
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2518

    
2519
H264_LOWPASS(put_       , op_put, op2_put)
2520
H264_LOWPASS(avg_       , op_avg, op2_avg)
2521
H264_MC(put_, 2)
2522
H264_MC(put_, 4)
2523
H264_MC(put_, 8)
2524
H264_MC(put_, 16)
2525
H264_MC(avg_, 4)
2526
H264_MC(avg_, 8)
2527
H264_MC(avg_, 16)
2528

    
2529
#undef op_avg
2530
#undef op_put
2531
#undef op2_avg
2532
#undef op2_put
2533
#endif
2534

    
2535
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
2536
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
2537
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2538
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2539

    
2540
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2541
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2542
    int i;
2543

    
2544
    for(i=0; i<h; i++){
2545
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2546
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2547
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2548
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2549
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2550
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2551
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2552
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2553
        dst+=dstStride;
2554
        src+=srcStride;
2555
    }
2556
}
2557

    
2558
/** Fixed-size 8x8 copy wrapper around put_pixels8_c (h = 8). */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size 8x8 average wrapper around avg_pixels8_c (h = 8). */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/** Fixed-size 16x16 copy wrapper around put_pixels16_c (h = 16). */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/** Fixed-size 16x16 average wrapper around avg_pixels16_c (h = 16). */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#if CONFIG_RV40_DECODER
/* RV40's (3,3) sub-pel position is implemented as the plain xy2
 * (center half-pel) average rather than the generic RV40 filter. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2587
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2588
    int i;
2589

    
2590
    for(i=0; i<w; i++){
2591
        const int src_1= src[ -srcStride];
2592
        const int src0 = src[0          ];
2593
        const int src1 = src[  srcStride];
2594
        const int src2 = src[2*srcStride];
2595
        const int src3 = src[3*srcStride];
2596
        const int src4 = src[4*srcStride];
2597
        const int src5 = src[5*srcStride];
2598
        const int src6 = src[6*srcStride];
2599
        const int src7 = src[7*srcStride];
2600
        const int src8 = src[8*srcStride];
2601
        const int src9 = src[9*srcStride];
2602
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2603
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2604
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2605
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2606
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2607
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2608
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2609
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2610
        src++;
2611
        dst++;
2612
    }
2613
}
2614

    
2615
/** WMV2 mspel (1,0): average of source and horizontally filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/** WMV2 mspel (2,0): horizontally filtered block written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/** WMV2 mspel (3,0): average of src+1 and the horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/** WMV2 mspel (0,2): vertically filtered block written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/** WMV2 mspel (1,2): average of the V-filtered and HV-filtered blocks.
 *  halfH holds 11 H-filtered rows (starting one row above src) so the
 *  vertical pass has the context it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** WMV2 mspel (3,2): like mc12 but the pure vertical pass starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** WMV2 mspel (2,2): separable H then V filter, result straight to dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2660
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2661
    int x;
2662
    const int strength= ff_h263_loop_filter_strength[qscale];
2663

    
2664
    for(x=0; x<8; x++){
2665
        int d1, d2, ad1;
2666
        int p0= src[x-2*stride];
2667
        int p1= src[x-1*stride];
2668
        int p2= src[x+0*stride];
2669
        int p3= src[x+1*stride];
2670
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2671

    
2672
        if     (d<-2*strength) d1= 0;
2673
        else if(d<-  strength) d1=-2*strength - d;
2674
        else if(d<   strength) d1= d;
2675
        else if(d< 2*strength) d1= 2*strength - d;
2676
        else                   d1= 0;
2677

    
2678
        p1 += d1;
2679
        p2 -= d1;
2680
        if(p1&256) p1= ~(p1>>31);
2681
        if(p2&256) p2= ~(p2>>31);
2682

    
2683
        src[x-1*stride] = p1;
2684
        src[x+0*stride] = p2;
2685

    
2686
        ad1= FFABS(d1)>>1;
2687

    
2688
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2689

    
2690
        src[x-2*stride] = p0 - d2;
2691
        src[x+  stride] = p3 + d2;
2692
    }
2693
    }
2694
}
2695

    
2696
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2697
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2698
    int y;
2699
    const int strength= ff_h263_loop_filter_strength[qscale];
2700

    
2701
    for(y=0; y<8; y++){
2702
        int d1, d2, ad1;
2703
        int p0= src[y*stride-2];
2704
        int p1= src[y*stride-1];
2705
        int p2= src[y*stride+0];
2706
        int p3= src[y*stride+1];
2707
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2708

    
2709
        if     (d<-2*strength) d1= 0;
2710
        else if(d<-  strength) d1=-2*strength - d;
2711
        else if(d<   strength) d1= d;
2712
        else if(d< 2*strength) d1= 2*strength - d;
2713
        else                   d1= 0;
2714

    
2715
        p1 += d1;
2716
        p2 -= d1;
2717
        if(p1&256) p1= ~(p1>>31);
2718
        if(p2&256) p2= ~(p2>>31);
2719

    
2720
        src[y*stride-1] = p1;
2721
        src[y*stride+0] = p2;
2722

    
2723
        ad1= FFABS(d1)>>1;
2724

    
2725
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2726

    
2727
        src[y*stride-2] = p0 - d2;
2728
        src[y*stride+1] = p3 + d2;
2729
    }
2730
    }
2731
}
2732

    
2733
/**
 * H.261 in-loop filter on one 8x8 block: separable [1 2 1]/4 smoothing,
 * vertical pass into temp[] (border rows passed through scaled by 4),
 * then horizontal pass back into src (border columns passed through).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical [1 2 1] pass; top/bottom rows copied with the same x4 scale */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal [1 2 1] pass with rounding; left/right columns unfiltered */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Sum of absolute differences of a 16-wide, h-tall block.
 * The first argument is an unused context pointer (me_cmp_func signature).
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16xh block against the horizontal half-pel interpolation
 * (avg2 of each pixel and its right neighbour) of pix2.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of a 16xh block against the vertical half-pel interpolation
 * (avg2 of each pixel and the pixel one line below) of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of a 16xh block against the center (x+y) half-pel interpolation
 * (avg4 of the 2x2 neighbourhood) of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences of an 8-wide, h-tall block.
 * The first argument is an unused context pointer (me_cmp_func signature).
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8xh block against the horizontal half-pel interpolation
 * (avg2 of each pixel and its right neighbour) of pix2.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of an 8xh block against the vertical half-pel interpolation
 * (avg2 of each pixel and the pixel one line below) of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of an 8xh block against the center (x+y) half-pel interpolation
 * (avg4 of the 2x2 neighbourhood) of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2961
    MpegEncContext *c = v;
2962
    int score1=0;
2963
    int score2=0;
2964
    int x,y;
2965

    
2966
    for(y=0; y<h; y++){
2967
        for(x=0; x<16; x++){
2968
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2969
        }
2970
        if(y+1<h){
2971
            for(x=0; x<15; x++){
2972
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
2973
                             - s1[x+1] + s1[x+1+stride])
2974
                        -FFABS(  s2[x  ] - s2[x  +stride]
2975
                             - s2[x+1] + s2[x+1+stride]);
2976
            }
2977
        }
2978
        s1+= stride;
2979
        s2+= stride;
2980
    }
2981

    
2982
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2983
    else  return score1 + FFABS(score2)*8;
2984
}
2985

    
2986
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2987
    MpegEncContext *c = v;
2988
    int score1=0;
2989
    int score2=0;
2990
    int x,y;
2991

    
2992
    for(y=0; y<h; y++){
2993
        for(x=0; x<8; x++){
2994
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2995
        }
2996
        if(y+1<h){
2997
            for(x=0; x<7; x++){
2998
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
2999
                             - s1[x+1] + s1[x+1+stride])
3000
                        -FFABS(  s2[x  ] - s2[x  +stride]
3001
                             - s2[x+1] + s2[x+1+stride]);
3002
            }
3003
        }
3004
        s1+= stride;
3005
        s2+= stride;
3006
    }
3007

    
3008
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3009
    else  return score1 + FFABS(score2)*8;
3010
}
3011

    
3012
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3013
    int i;
3014
    unsigned int sum=0;
3015

    
3016
    for(i=0; i<8*8; i++){
3017
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3018
        int w= weight[i];
3019
        b>>= RECON_SHIFT;
3020
        assert(-512<b && b<512);
3021

    
3022
        sum += (w*b)*(w*b)>>4;
3023
    }
3024
    return sum>>2;
3025
}
3026

    
3027
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3028
    int i;
3029

    
3030
    for(i=0; i<8*8; i++){
3031
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3032
    }
3033
}
3034

    
3035
/**
3036
 * permutes an 8x8 block.
3037
 * @param block the block which will be permuted according to the given permutation vector
3038
 * @param permutation the permutation vector
3039
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3040
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3041
 *                  (inverse) permutated to scantable order!
3042
 */
3043
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3044
{
3045
    int i;
3046
    DCTELEM temp[64];
3047

    
3048
    if(last<=0) return;
3049
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3050

    
3051
    for(i=0; i<=last; i++){
3052
        const int j= scantable[i];
3053
        temp[j]= block[j];
3054
        block[j]=0;
3055
    }
3056

    
3057
    for(i=0; i<=last; i++){
3058
        const int j= scantable[i];
3059
        const int perm_j= permutation[j];
3060
        block[perm_j]= temp[j];
3061
    }
3062
}
3063

    
3064
/* Constant comparator used for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3067

    
3068
/**
 * Fill the comparison-function table cmp[0..5] from the DSPContext,
 * selecting the family by the FF_CMP_* id in the low byte of type.
 * Slots not covered by the selected family remain NULL (memset below);
 * NOTE(review): index 1 is used elsewhere in this file for 8x8 blocks
 * (see rd8x8_c's use of sse[1]) -- presumably 0 is 16x16; confirm in
 * dsputil.h.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            /* unknown id: slot stays NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3127

    
3128
/* Zero a single 8x8 block of DCT coefficients. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
3132

    
3133
/**
3134
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3135
 */
3136
static void clear_blocks_c(DCTELEM *blocks)
3137
{
3138
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3139
}
3140

    
3141
/**
 * dst[i] += src[i] for i in [0, w), byte-wise with 8-bit wraparound.
 * The main loop adds sizeof(long) bytes per iteration (SWAR): the per-byte
 * top bits are masked off so the 7-bit partial sums cannot carry across
 * byte lanes; the top bits are then recombined carry-lessly with XOR.
 * NOTE(review): the word loop loads/stores through long* -- assumes the
 * platform tolerates such (possibly unaligned) accesses, as the original did.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* local copies of the file-level pb_7f/pb_80 masks keep the kernel
     * self-contained: 0x7f resp. 0x80 replicated into every byte */
    const unsigned long mask_7f = ~0UL / 255 * 0x7f;
    const unsigned long mask_80 = ~0UL / 255 * 0x80;
    long i;

    /* Fix: cast sizeof() to long. The old "i <= w - sizeof(long)" compared
     * in unsigned arithmetic, so any w < sizeof(long) wrapped the bound
     * around and ran the word loop out of bounds. */
    for (i = 0; i <= w - (long)sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src + i);
        long b = *(long*)(dst + i);
        *(long*)(dst + i) = ((a & mask_7f) + (b & mask_7f)) ^ ((a ^ b) & mask_80);
    }
    for (; i < w; i++)   /* scalar tail for the remaining bytes */
        dst[i + 0] += src[i + 0];
}
3151

    
3152
/**
 * dst[i] = src1[i] + src2[i] for i in [0, w), byte-wise with 8-bit
 * wraparound, using the same SWAR word loop as add_bytes_c().
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long mask_7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long mask_80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */
    long i;

    /* Fix: cast sizeof() to long -- the old unsigned "w - sizeof(long)"
     * bound wrapped for w < sizeof(long) and overran the buffers. */
    for (i = 0; i <= w - (long)sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src1 + i);
        long b = *(long*)(src2 + i);
        *(long*)(dst + i) = ((a & mask_7f) + (b & mask_7f)) ^ ((a ^ b) & mask_80);
    }
    for (; i < w; i++)   /* scalar tail */
        dst[i] = src1[i] + src2[i];
}
3162

    
3163
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w), byte-wise with 8-bit
 * wraparound. The word loop performs a carry-less SWAR subtraction;
 * on targets without fast unaligned access, a byte-wise unrolled path
 * is used when src2 is misaligned.
 * NOTE(review): only src2's alignment is checked; src1/dst are presumably
 * guaranteed aligned by callers -- confirm.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long mask_7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long mask_80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* Fix: cast sizeof() to long -- the old unsigned "w - sizeof(long)"
     * bound wrapped for w < sizeof(long) and overran the buffers. */
    for (i = 0; i <= w - (long)sizeof(long); i += sizeof(long)) {
        long a = *(long*)(src1 + i);
        long b = *(long*)(src2 + i);
        *(long*)(dst + i) = ((a | mask_80) - (b & mask_7f)) ^ ((a ^ b ^ mask_80) & mask_80);
    }
    for (; i < w; i++)   /* scalar tail */
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}
3187

    
3188
/**
 * Reconstruct one row from HuffYUV-style median prediction:
 * dst[i] = median(left, top, left + top - topleft) + diff[i], where src1
 * supplies the row above (top/topleft) and diff the stored residuals.
 * *left / *left_top carry the predictor state in and out.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        left_val = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft) & 0xFF) + diff[i];
        topleft  = src1[i];
        dst[i]   = left_val;
    }

    *left     = left_val;
    *left_top = topleft;
}
3204

    
3205
/**
 * Forward counterpart of add_hfyu_median_prediction_c(): emit the residual
 * dst[i] = src2[i] - median(left, top, left + top - topleft), with src1 as
 * the row above and src2 the row being encoded.
 * *left / *left_top carry the predictor state in and out.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        left_val = src2[i];
        dst[i]   = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft;
}
3222

    
3223
/**
 * Left prediction: each output byte is the running sum of the input
 * residuals, seeded with acc. dst stores the low 8 bits of the running sum;
 * the raw (unmasked) accumulator is returned for the next call.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i = 0;

    /* pairwise-unrolled main loop */
    while (i < w - 1) {
        acc += src[i];
        dst[i++] = acc;
        acc += src[i];
        dst[i++] = acc;
    }

    /* odd trailing element, if any */
    while (i < w) {
        acc += src[i];
        dst[i++] = acc;
    }

    return acc;
}
3241

    
3242
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Left prediction over a row of packed BGR32 pixels: each channel is an
 * independent running byte sum, seeded from and written back (unmasked)
 * to *red / *green / *blue / *alpha. dst receives the low 8 bits.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int px;
    int rs = *red, gs = *green, bs = *blue, as = *alpha;

    for (px = 0; px < w; px++) {
        bs += src[4*px + B];
        gs += src[4*px + G];
        rs += src[4*px + R];
        as += src[4*px + A];

        dst[4*px + B] = bs;
        dst[4*px + G] = gs;
        dst[4*px + R] = rs;
        dst[4*px + A] = as;
    }

    *red   = rs;
    *green = gs;
    *blue  = bs;
    *alpha = as;
}
#undef B
#undef G
#undef R
#undef A
3282

    
3283
/* 2-point butterfly writing to separate outputs: o1 = i1+i2, o2 = i1-i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: (x, y) <- (x+y, x-y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Last Hadamard stage folded with the SAD: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3297

    
3298
/**
 * SATD of an 8x8 block: sum of absolute values of the 2-D Hadamard
 * transform of the pixel differences src - dst. Three butterfly stages
 * run per row, then per column; the final column stage is folded into
 * BUTTERFLYA, which accumulates absolute values.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard of each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass over the transformed rows; last stage sums |.| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
/* disabled debug code: tracks the maximum score ever seen */
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3349

    
3350
/**
 * Intra variant of hadamard8_diff8x8_c(): SATD of the source pixels
 * themselves (no reference block). The DC term temp[0]+temp[32] -- whose
 * butterfly result |temp[0]+temp[32]| was added by the first BUTTERFLYA --
 * is subtracted at the end so the mean does not contribute to the score.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last stage sums absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3397

    
3398
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3399
    MpegEncContext * const s= (MpegEncContext *)c;
3400
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3401

    
3402
    assert(h==8);
3403

    
3404
    s->dsp.diff_pixels(temp, src1, src2, stride);
3405
    s->dsp.fdct(temp);
3406
    return s->dsp.sum_abs_dctelem(temp);
3407
}
3408

    
3409
#if CONFIG_GPL
3410
#define DCT8_1D {\
3411
    const int s07 = SRC(0) + SRC(7);\
3412
    const int s16 = SRC(1) + SRC(6);\
3413
    const int s25 = SRC(2) + SRC(5);\
3414
    const int s34 = SRC(3) + SRC(4);\
3415
    const int a0 = s07 + s34;\
3416
    const int a1 = s16 + s25;\
3417
    const int a2 = s07 - s34;\
3418
    const int a3 = s16 - s25;\
3419
    const int d07 = SRC(0) - SRC(7);\
3420
    const int d16 = SRC(1) - SRC(6);\
3421
    const int d25 = SRC(2) - SRC(5);\
3422
    const int d34 = SRC(3) - SRC(4);\
3423
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3424
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3425
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3426
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3427
    DST(0,  a0 + a1     ) ;\
3428
    DST(1,  a4 + (a7>>2)) ;\
3429
    DST(2,  a2 + (a3>>1)) ;\
3430
    DST(3,  a5 + (a6>>2)) ;\
3431
    DST(4,  a0 - a1     ) ;\
3432
    DST(5,  a6 - (a5>>2)) ;\
3433
    DST(6, (a2>>1) - a3 ) ;\
3434
    DST(7, (a4>>2) - a7 ) ;\
3435
}
3436

    
3437
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3438
    MpegEncContext * const s= (MpegEncContext *)c;
3439
    DCTELEM dct[8][8];
3440
    int i;
3441
    int sum=0;
3442

    
3443
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3444

    
3445
#define SRC(x) dct[i][x]
3446
#define DST(x,v) dct[i][x]= v
3447
    for( i = 0; i < 8; i++ )
3448
        DCT8_1D
3449
#undef SRC
3450
#undef DST
3451

    
3452
#define SRC(x) dct[x][i]
3453
#define DST(x,v) sum += FFABS(v)
3454
    for( i = 0; i < 8; i++ )
3455
        DCT8_1D
3456
#undef SRC
3457
#undef DST
3458
    return sum;
3459
}
3460
#endif
3461

    
3462
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3463
    MpegEncContext * const s= (MpegEncContext *)c;
3464
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3465
    int sum=0, i;
3466

    
3467
    assert(h==8);
3468

    
3469
    s->dsp.diff_pixels(temp, src1, src2, stride);
3470
    s->dsp.fdct(temp);
3471

    
3472
    for(i=0; i<64; i++)
3473
        sum= FFMAX(sum, FFABS(temp[i]));
3474

    
3475
    return sum;
3476
}
3477

    
3478
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3479
    MpegEncContext * const s= (MpegEncContext *)c;
3480
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3481
    DCTELEM * const bak = temp+64;
3482
    int sum=0, i;
3483

    
3484
    assert(h==8);
3485
    s->mb_intra=0;
3486

    
3487
    s->dsp.diff_pixels(temp, src1, src2, stride);
3488

    
3489
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3490

    
3491
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3492
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3493
    ff_simple_idct(temp); //FIXME
3494

    
3495
    for(i=0; i<64; i++)
3496
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3497

    
3498
    return sum;
3499
}
3500

    
3501
/**
 * FF_CMP_RD kernel: rate-distortion score of encoding the 8x8 difference
 * block. The difference is quantized, the VLC bit cost of the quantized
 * run/level pairs is estimated, the block is dequantized and added back,
 * and the result is SSE distortion + a lambda-like bit-cost term.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies so lsrc2 can be reconstructed in place below */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the AC VLC tables for the current macroblock mode; intra also
     * pays for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* estimate the bits for each (run, level) pair in scan order; the
     * final coefficient uses the "last" table */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so level indexes the table symmetrically */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out-of-table level: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add back onto the local source copy */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3576

    
3577
/**
 * FF_CMP_BIT kernel: estimated VLC bit count for encoding the quantized
 * DCT of the 8x8 difference block (rate only -- no distortion term;
 * same run/level costing as rd8x8_c()).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the AC VLC tables; intra also pays for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* cost each (run, level) pair in scan order; final coefficient uses
     * the "last" table, out-of-range levels the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so level indexes the table symmetrically */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3635

    
3636
/* Intra vertical SAD: sum of absolute differences between vertically
 * adjacent pixels inside one block (no reference block; the dummy
 * argument keeps the me_cmp_func signature). Expanded below for 8- and
 * 16-pixel-wide blocks. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3653

    
3654
/**
 * Inter vertical SAD over a 16-pixel-wide block: sum of absolute
 * differences between the vertical gradients of s1 and s2.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3668

    
3669
/* Squared value helper for the VSSE kernels below. */
#define SQ(a) ((a)*(a))
/* Intra vertical SSE: sum of squared differences between vertically
 * adjacent pixels inside one block. Expanded below for 8- and
 * 16-pixel-wide blocks. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3687

    
3688
/**
 * Inter vertical SSE over a 16-pixel-wide block: sum of squared
 * differences between the vertical gradients of s1 and s2.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += SQ(grad);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3702

    
3703
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
    }
    return total;
}
3711

    
3712
/* Instantiate the 16x16 comparison functions from the 8x8 kernels above.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this file (presumably
 * summing the four 8x8 sub-block scores) -- confirm in dsputil.h. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

    
3723
/* Element-wise product: dst[k] = src0[k] * src1[k]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k];
}
3728

    
3729
/* Element-wise product with src1 read back-to-front:
 * dst[k] = src0[k] * src1[len-1-k]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
3735

    
3736
/* Fused multiply-add: dst[k] = src0[k] * src1[k] + src2[k]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k] + src2[k];
}
3741

    
3742
/**
 * Overlap-add windowing: processes src0 forward and src1 backward through
 * the window, writing a mirrored pair of outputs per step.
 * dst and win must hold 2*len valid elements; src0/src1 hold len each.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int a, b;
    dst  += len;
    win  += len;
    src0 += len;
    for (a = -len, b = len - 1; a < 0; a++, b--) {
        const float s0 = src0[a];
        const float s1 = src1[b];
        const float wa = win[a];
        const float wb = win[b];
        dst[a] = s0 * wb - s1 * wa;
        dst[b] = s0 * wa + s1 * wb;
    }
}
3758

    
3759
/* Scale a vector by a scalar: dst[k] = src[k] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
3766

    
3767
/* dst[k] = src[k] * sv[k/2][k%2] * mul: every pair of outputs consumes
 * one 2-element sub-vector from sv. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 2, sv++) {
        dst[k]     = src[k]     * sv[0][0] * mul;
        dst[k + 1] = src[k + 1] * sv[0][1] * mul;
    }
}
3776

    
3777
/* dst[k] = src[k] * sv[k/4][k%4] * mul: every 4 outputs consume one
 * 4-element sub-vector from sv. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;
    for (k = 0; k < len; k += 4, sv++) {
        dst[k]     = src[k]     * sv[0][0] * mul;
        dst[k + 1] = src[k + 1] * sv[0][1] * mul;
        dst[k + 2] = src[k + 2] * sv[0][2] * mul;
        dst[k + 3] = src[k + 3] * sv[0][3] * mul;
    }
}
3788

    
3789
/* dst[k] = sv[k/2][k%2] * mul: flatten the 2-element sub-vectors of sv
 * into dst, scaled by mul. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;
    for (k = 0; k < len; k += 2, sv++) {
        dst[k]     = sv[0][0] * mul;
        dst[k + 1] = sv[0][1] * mul;
    }
}
3798

    
3799
/* dst[k] = sv[k/4][k%4] * mul: flatten the 4-element sub-vectors of sv
 * into dst, scaled by mul. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;
    for (k = 0; k < len; k += 4, sv++) {
        dst[k]     = sv[0][0] * mul;
        dst[k + 1] = sv[0][1] * mul;
        dst[k + 2] = sv[0][2] * mul;
        dst[k + 3] = sv[0][3] * mul;
    }
}
3810

    
3811
/* In-place butterfly on two non-aliasing vectors:
 * (v1[k], v2[k]) <- (v1[k] + v2[k], v1[k] - v2[k]). */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;
    for (k = 0; k < len; k++) {
        const float diff = v1[k] - v2[k];
        v1[k] = v1[k] + v2[k];
        v2[k] = diff;
    }
}
3821

    
3822
/* Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k;

    for (k = 0; k < len; k++)
        acc += v1[k] * v2[k];

    return acc;
}
3832

    
3833
/**
 * Clip one float, viewed as its IEEE-754 bit pattern, against the bit
 * patterns of a negative min and a positive max (see vector_clipf_c).
 * In the unsigned compare, any negative float (sign bit set) exceeds
 * mini's pattern iff its magnitude exceeds |min|; flipping the sign bit
 * orders the positive range against maxisign the same way.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    /* fix: 0x80000000U instead of (1<<31) -- left-shifting into the sign
     * bit of a signed int is undefined behavior */
    else if((a ^ 0x80000000U) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max]; requires *min < 0 < *max and len
 * a multiple of 8. Operates on the raw bit patterns via type-punned
 * pointers (as the original did -- relies on the usual float/uint32
 * punning tolerance of the supported compilers).
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ 0x80000000U; /* was maxi ^ (1<<31): UB shift */
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
3860
/* Clip a float vector to [min, max]; len must be a multiple of 8
 * (the loop processes 8 elements per iteration). */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if (min < 0 && max > 0) {
        /* range straddles zero: take the bit-pattern fast path */
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int j;
            for (j = 0; j < 8; j++)
                dst[i + j] = av_clipf(src[i + j], min, max);
        }
    }
}
3877

    
3878
/* Dot product of two int16 vectors; each product is right-shifted by
 * 'shift' before accumulation. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; k < order; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
3887

    
3888
/* Returns dot(v1, v2) computed on the values of v1 *before* the in-place
 * multiply-accumulate v1[k] += mul * v3[k] performed in the same pass. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int acc = 0;
    int k;

    for (k = 0; k < order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
3897

    
3898
/* IDCT constants: W_i = 2048*sqrt(2)*cos(i*pi/16), 11-bit fixed point. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* One horizontal pass of the WMV2 8-point IDCT (rounding: +(1<<7), >>8). */
static void wmv2_idct_row(short * b)
{
    int e0,e1,e2,e3,e4,e5,e6,e7;
    int odd1,odd2;
    /* step 1: even/odd butterflies on the input coefficients */
    e0 = W0*b[0]+W0*b[4];
    e4 = W0*b[0]-W0*b[4];
    e1 = W1*b[1]+W7*b[7];
    e7 = W7*b[1]-W1*b[7];
    e2 = W2*b[2]+W6*b[6];
    e6 = W6*b[2]-W2*b[6];
    e5 = W5*b[5]+W3*b[3];
    e3 = W3*b[5]-W5*b[3];
    /* step 2: 181/256 ~= 1/sqrt(2) */
    odd1 = (181*(e1-e5+e7-e3)+128)>>8;
    odd2 = (181*(e1-e5-e7+e3)+128)>>8;
    /* step 3: final butterflies with rounding */
    b[0] = (e0+e2+e1+e5 + (1<<7))>>8;
    b[1] = (e4+e6 +odd1 + (1<<7))>>8;
    b[2] = (e4-e6 +odd2 + (1<<7))>>8;
    b[3] = (e0-e2+e7+e3 + (1<<7))>>8;
    b[4] = (e0-e2-e7-e3 + (1<<7))>>8;
    b[5] = (e4-e6 -odd2 + (1<<7))>>8;
    b[6] = (e4+e6 -odd1 + (1<<7))>>8;
    b[7] = (e0+e2-e1-e5 + (1<<7))>>8;
}

/* One vertical pass of the WMV2 8-point IDCT over a single column of an
 * 8x8 block (stride 8); step 1 keeps extra precision via the >>3. */
static void wmv2_idct_col(short * b)
{
    int e0,e1,e2,e3,e4,e5,e6,e7;
    int odd1,odd2;
    /* step 1, with extended precision */
    e0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    e4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    e1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    e7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    e2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    e6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    e5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    e3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    /* step 2 */
    odd1 = (181*(e1-e5+e7-e3)+128)>>8;
    odd2 = (181*(e1-e5-e7+e3)+128)>>8;
    /* step 3: final butterflies with rounding */
    b[8*0] = (e0+e2+e1+e5 + (1<<13))>>14;
    b[8*1] = (e4+e6 +odd1 + (1<<13))>>14;
    b[8*2] = (e4-e6 +odd2 + (1<<13))>>14;
    b[8*3] = (e0-e2+e7+e3 + (1<<13))>>14;
    b[8*4] = (e0-e2-e7-e3 + (1<<13))>>14;
    b[8*5] = (e4-e6 -odd2 + (1<<13))>>14;
    b[8*6] = (e4+e6 -odd1 + (1<<13))>>14;
    b[8*7] = (e0+e2-e1-e5 + (1<<13))>>14;
}
3960
/**
 * Full in-place WMV2 inverse DCT on an 8x8 coefficient block:
 * a row pass over each of the 8 rows, then a column pass over
 * each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8 * row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/* WMV2 IDCT, then write the clamped pixel block to dest (overwrite). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* WMV2 IDCT, then add the result onto the existing pixels at dest,
 * clamped to the valid pixel range. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* jpeg-reference (j_rev_dct) 8x8 IDCT, then write the clamped block
 * to dest (overwrite). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* jpeg-reference (j_rev_dct) 8x8 IDCT, then add the clamped result onto
 * the existing pixels at dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}

/* 4x4 reduced-resolution IDCT (lowres 1) + put. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT (lowres 1) + add. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

/* 2x2 reduced-resolution IDCT (lowres 2) + put. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT (lowres 2) + add. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

/* 1x1 reduced-resolution "IDCT" (lowres 3) + put: only the DC coefficient
 * remains, so the single output pixel is block[0] descaled by 8 with
 * rounding (+4) and clamped to 0..255 via the crop table.
 * line_size is unused for a single pixel. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 reduced-resolution "IDCT" (lowres 3) + add: the rounded, descaled DC
 * value is added onto the existing pixel and clamped via the crop table.
 * line_size is unused for a single pixel. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

/* Do-nothing function with an op_pixels-style signature.
 * NOTE(review): presumably installed where a valid function pointer is
 * required but no work should happen (e.g. benchmarking/placeholder) --
 * confirm at the call sites. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

/* init static data */
4031
av_cold void dsputil_static_init(void)
4032
{
4033
    int i;
4034

    
4035
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4036
    for(i=0;i<MAX_NEG_CROP;i++) {
4037
        ff_cropTbl[i] = 0;
4038
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4039
    }
4040

    
4041
    for(i=0;i<512;i++) {
4042
        ff_squareTbl[i] = (i - 256) * (i - 256);
4043
    }
4044

    
4045
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4046
}
4047

    
4048
int ff_check_alignment(void){
4049
    static int did_fail=0;
4050
    DECLARE_ALIGNED(16, int, aligned);
4051

    
4052
    if((intptr_t)&aligned & 15){
4053
        if(!did_fail){
4054
#if HAVE_MMX || HAVE_ALTIVEC
4055
            av_log(NULL, AV_LOG_ERROR,
4056
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4057
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4058
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4059
                "Do not report crashes to FFmpeg developers.\n");
4060
#endif
4061
            did_fail=1;
4062
        }
4063
        return -1;
4064
    }
4065
    return 0;
4066
}
4067

    
4068
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4069
{
4070
    int i;
4071

    
4072
    ff_check_alignment();
4073

    
4074
#if CONFIG_ENCODERS
4075
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4076
        c->fdct = fdct_ifast;
4077
        c->fdct248 = fdct_ifast248;
4078
    }
4079
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4080
        c->fdct = ff_faandct;
4081
        c->fdct248 = ff_faandct248;
4082
    }
4083
    else {
4084
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4085
        c->fdct248 = ff_fdct248_islow;
4086
    }
4087
#endif //CONFIG_ENCODERS
4088

    
4089
    if(avctx->lowres==1){
4090
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4091
            c->idct_put= ff_jref_idct4_put;
4092
            c->idct_add= ff_jref_idct4_add;
4093
        }else{
4094
            c->idct_put= ff_h264_lowres_idct_put_c;
4095
            c->idct_add= ff_h264_lowres_idct_add_c;
4096
        }
4097
        c->idct    = j_rev_dct4;
4098
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4099
    }else if(avctx->lowres==2){
4100
        c->idct_put= ff_jref_idct2_put;
4101
        c->idct_add= ff_jref_idct2_add;
4102
        c->idct    = j_rev_dct2;
4103
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4104
    }else if(avctx->lowres==3){
4105
        c->idct_put= ff_jref_idct1_put;
4106
        c->idct_add= ff_jref_idct1_add;
4107
        c->idct    = j_rev_dct1;
4108
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4109
    }else{
4110
        if(avctx->idct_algo==FF_IDCT_INT){
4111
            c->idct_put= ff_jref_idct_put;
4112
            c->idct_add= ff_jref_idct_add;
4113
            c->idct    = j_rev_dct;
4114
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4115
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4116
                avctx->idct_algo==FF_IDCT_VP3){
4117
            c->idct_put= ff_vp3_idct_put_c;
4118
            c->idct_add= ff_vp3_idct_add_c;
4119
            c->idct    = ff_vp3_idct_c;
4120
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4121
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4122
            c->idct_put= ff_wmv2_idct_put_c;
4123
            c->idct_add= ff_wmv2_idct_add_c;
4124
            c->idct    = ff_wmv2_idct_c;
4125
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4126
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4127
            c->idct_put= ff_faanidct_put;
4128
            c->idct_add= ff_faanidct_add;
4129
            c->idct    = ff_faanidct;
4130
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4131
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4132
            c->idct_put= ff_ea_idct_put_c;
4133
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4134
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4135
            c->idct     = ff_bink_idct_c;
4136
            c->idct_add = ff_bink_idct_add_c;
4137
            c->idct_put = ff_bink_idct_put_c;
4138
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4139
        }else{ //accurate/default
4140
            c->idct_put= ff_simple_idct_put;
4141
            c->idct_add= ff_simple_idct_add;
4142
            c->idct    = ff_simple_idct;
4143
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4144
        }
4145
    }
4146

    
4147
    c->get_pixels = get_pixels_c;
4148
    c->diff_pixels = diff_pixels_c;
4149
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
4150
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4151
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4152
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
4153
    c->add_pixels8 = add_pixels8_c;
4154
    c->add_pixels4 = add_pixels4_c;
4155
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4156
    c->emulated_edge_mc = ff_emulated_edge_mc;
4157
    c->gmc1 = gmc1_c;
4158
    c->gmc = ff_gmc_c;
4159
    c->clear_block = clear_block_c;
4160
    c->clear_blocks = clear_blocks_c;
4161
    c->pix_sum = pix_sum_c;
4162
    c->pix_norm1 = pix_norm1_c;
4163

    
4164
    c->fill_block_tab[0] = fill_block16_c;
4165
    c->fill_block_tab[1] = fill_block8_c;
4166
    c->scale_block = scale_block_c;
4167

    
4168
    /* TODO [0] 16  [1] 8 */
4169
    c->pix_abs[0][0] = pix_abs16_c;
4170
    c->pix_abs[0][1] = pix_abs16_x2_c;
4171
    c->pix_abs[0][2] = pix_abs16_y2_c;
4172
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4173
    c->pix_abs[1][0] = pix_abs8_c;
4174
    c->pix_abs[1][1] = pix_abs8_x2_c;
4175
    c->pix_abs[1][2] = pix_abs8_y2_c;
4176
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4177

    
4178
#define dspfunc(PFX, IDX, NUM) \
4179
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4180
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4181
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4182
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4183

    
4184
    dspfunc(put, 0, 16);
4185
    dspfunc(put_no_rnd, 0, 16);
4186
    dspfunc(put, 1, 8);
4187
    dspfunc(put_no_rnd, 1, 8);
4188
    dspfunc(put, 2, 4);
4189
    dspfunc(put, 3, 2);
4190

    
4191
    dspfunc(avg, 0, 16);
4192
    dspfunc(avg_no_rnd, 0, 16);
4193
    dspfunc(avg, 1, 8);
4194
    dspfunc(avg_no_rnd, 1, 8);
4195
    dspfunc(avg, 2, 4);
4196
    dspfunc(avg, 3, 2);
4197
#undef dspfunc
4198

    
4199
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4200
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4201

    
4202
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4203
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4204
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4205
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4206
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4207
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4208
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4209
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4210
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4211

    
4212
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4213
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4214
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4215
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4216
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4217
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4218
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4219
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4220
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4221

    
4222
#define dspfunc(PFX, IDX, NUM) \
4223
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4224
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4225
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4226
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4227
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4228
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4229
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4230
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4231
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4232
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4233
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4234
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4235
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4236
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4237
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4238
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4239

    
4240
    dspfunc(put_qpel, 0, 16);
4241
    dspfunc(put_no_rnd_qpel, 0, 16);
4242

    
4243
    dspfunc(avg_qpel, 0, 16);
4244
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4245

    
4246
    dspfunc(put_qpel, 1, 8);
4247
    dspfunc(put_no_rnd_qpel, 1, 8);
4248

    
4249
    dspfunc(avg_qpel, 1, 8);
4250
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4251

    
4252
    dspfunc(put_h264_qpel, 0, 16);
4253
    dspfunc(put_h264_qpel, 1, 8);
4254
    dspfunc(put_h264_qpel, 2, 4);
4255
    dspfunc(put_h264_qpel, 3, 2);
4256
    dspfunc(avg_h264_qpel, 0, 16);
4257
    dspfunc(avg_h264_qpel, 1, 8);
4258
    dspfunc(avg_h264_qpel, 2, 4);
4259

    
4260
#undef dspfunc
4261
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4262
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4263
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4264
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4265
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4266
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4267

    
4268
    c->draw_edges = draw_edges_c;
4269

    
4270
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4271
    ff_mlp_init(c, avctx);
4272
#endif
4273
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4274
    ff_intrax8dsp_init(c,avctx);
4275
#endif
4276
#if CONFIG_RV30_DECODER
4277
    ff_rv30dsp_init(c,avctx);
4278
#endif
4279
#if CONFIG_RV40_DECODER
4280
    ff_rv40dsp_init(c,avctx);
4281
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4282
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4283
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4284
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4285
#endif
4286

    
4287
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4288
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4289
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4290
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4291
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4292
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4293
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4294
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4295

    
4296
#define SET_CMP_FUNC(name) \
4297
    c->name[0]= name ## 16_c;\
4298
    c->name[1]= name ## 8x8_c;