/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * DSP utils
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native arithmetic width
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
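
/* Illustrative example of the ~0UL/255 replication trick used above:
 * ~0UL/255 is 0x01010101 on a 32-bit unsigned long (0xFFFFFFFF/255) and
 * 0x0101010101010101 on a 64-bit one, so multiplying by 0x7f yields
 * 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f: the byte replicated into every lane
 * of the native machine word. */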

const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* non-permutated inverse zigzag_direct + 1 for the MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
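
/* Illustrative note: raster_end[i] is the highest permutated coefficient
 * index among the first i+1 scan positions, i.e. how far a raster-order
 * IDCT must run when only the first i+1 scan coefficients are nonzero.
 * With an identity permutation and ff_zigzag_direct this gives
 * raster_end[0]=0, raster_end[1]=1, raster_end[2]=8, ... up to 63. */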

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
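
/* Illustrative note: sq points at the middle of ff_squareTbl, which is
 * initialized so that sq[d] == d*d for d in [-256, 255]. The sse*_c
 * functions below therefore index it directly with a possibly negative
 * byte difference, e.g. sq[pix1[0] - pix2[0]] with pix1[0]=5, pix2[0]=8
 * reads sq[-3] == 9. */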

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}

static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }

    /* top and bottom + corners */
    buf -= w;
    last_line = buf + (height - 1) * wrap;
    if (sides & EDGE_TOP)
        for(i = 0; i < w; i++)
            memcpy(buf - (i + 1) * wrap, buf, width + w + w); // top
    if (sides & EDGE_BOTTOM)
        for (i = 0; i < w; i++)
            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); // bottom
}

/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    w    = end_x - start_x;
    src += start_y*linesize + start_x;
    buf += start_x;

    //top
    for(y=0; y<start_y; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    // copy existing part
    for(; y<end_y; y++){
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    //bottom
    src -= linesize;
    for(; y<block_h; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    buf -= block_h * linesize + start_x;
    while (block_h--){
       //left
        for(x=0; x<start_x; x++){
            buf[x] = buf[start_x];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x] = buf[end_x - 1];
        }
        buf += linesize;
    }
}
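
/* Usage sketch (hedged; field names such as edge_emu_buffer, linesize,
 * h_edge_pos and v_edge_pos follow MpegEncContext and are assumptions
 * here, not part of this file): when a motion vector makes a 17x17
 * reference block stick out of the picture, a decoder would first copy
 * it into a scratch buffer with replicated borders and interpolate from
 * there:
 *
 *     ff_emulated_edge_mc(s->edge_emu_buffer, src_ptr, s->linesize,
 *                         17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 *     src_ptr = s->edge_emu_buffer;
 */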

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                             int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
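
/* Illustrative note: the "signed" variant maps the signed block range
 * [-128, 127] onto the unsigned pixel range [0, 255] by adding 128 and
 * saturating, so a coefficient of -200 stores 0 and one of +300 stores
 * 255. */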

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                             int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
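
/* Illustrative note: src[i] * 0x0101 replicates a byte into both halves
 * of a 16-bit word (0xAB * 0x0101 == 0xABAB), so each 16-bit store
 * doubles a sample horizontally while the paired dst1/dst2 rows double
 * it vertically: a 2x2 upscale of the 8x8 block. dst1 and dst2 are
 * uint16_t pointers, so "+= linesize" advances them by 2*linesize bytes,
 * i.e. two output rows per source row. */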

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b
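
/* Illustrative note: rnd_avg32() averages four byte lanes at once via
 * (a|b) - (((a^b)&0xFEFEFEFE)>>1), which per byte equals (a+b+1)>>1:
 * for a=1, b=2 this is (1|2) - (((1^2)&0xFE)>>1) = 3 - 1 = 2. The 0xFE
 * mask clears each lane's low bit before the shift so no bit crosses
 * into the neighbouring byte. no_rnd_avg32() is the floor variant,
 * (a&b) + (((a^b)&0xFEFEFEFE)>>1), i.e. (a+b)>>1 per byte. */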

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
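
/* Illustrative note: avg2() rounds half up, avg4() does the same for
 * four samples with a +2 bias: avg2(1,2) = (1+2+1)>>1 = 2 and
 * avg4(1,2,3,4) = (1+2+3+4+2)>>2 = 3. */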

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
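
/* Illustrative note: gmc1_c() is 1/16-pel bilinear interpolation. The
 * four weights always sum to A+B+C+D = 16*16 = 256, so the >>8
 * renormalizes the sum; e.g. x16 = y16 = 8 gives A = B = C = D = 64, a
 * plain average of the four neighbouring samples. */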

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
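
/* Illustrative note: the tpel (third-pel, used by SVQ3) routines
 * approximate division with fixed-point multiplies: 683 = (2^11 + 1)/3,
 * so (683*(2*a + b + 1)) >> 11 is very nearly (2*a + b + 1)/3, and
 * 2731 ~ 2^15/12 plays the same role for the 12-weight diagonal cases. */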
1301

    
1302
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303
    int i,j;
1304
    for (i=0; i < height; i++) {
1305
      for (j=0; j < width; j++) {
1306
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1307
      }
1308
      src += stride;
1309
      dst += stride;
1310
    }
1311
}
1312

    
1313
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314
    int i,j;
1315
    for (i=0; i < height; i++) {
1316
      for (j=0; j < width; j++) {
1317
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1318
      }
1319
      src += stride;
1320
      dst += stride;
1321
    }
1322
}
1323

    
1324
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1325
    int i,j;
1326
    for (i=0; i < height; i++) {
1327
      for (j=0; j < width; j++) {
1328
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1329
      }
1330
      src += stride;
1331
      dst += stride;
1332
    }
1333
}
1334

    
1335
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1336
    int i,j;
1337
    for (i=0; i < height; i++) {
1338
      for (j=0; j < width; j++) {
1339
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1340
      }
1341
      src += stride;
1342
      dst += stride;
1343
    }
1344
}
1345

    
1346
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1347
    int i,j;
1348
    for (i=0; i < height; i++) {
1349
      for (j=0; j < width; j++) {
1350
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1351
      }
1352
      src += stride;
1353
      dst += stride;
1354
    }
1355
}
1356

    
1357
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1358
    int i,j;
1359
    for (i=0; i < height; i++) {
1360
      for (j=0; j < width; j++) {
1361
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1362
      }
1363
      src += stride;
1364
      dst += stride;
1365
    }
1366
}
1367

    
1368
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1369
    int i,j;
1370
    for (i=0; i < height; i++) {
1371
      for (j=0; j < width; j++) {
1372
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1373
      }
1374
      src += stride;
1375
      dst += stride;
1376
    }
1377
}
1378

    
1379
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1380
    switch(width){
1381
    case 2: avg_pixels2_c (dst, src, stride, height); break;
1382
    case 4: avg_pixels4_c (dst, src, stride, height); break;
1383
    case 8: avg_pixels8_c (dst, src, stride, height); break;
1384
    case 16:avg_pixels16_c(dst, src, stride, height); break;
1385
    }
1386
}
1387

    
1388
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389
    int i,j;
1390
    for (i=0; i < height; i++) {
1391
      for (j=0; j < width; j++) {
1392
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1393
      }
1394
      src += stride;
1395
      dst += stride;
1396
    }
1397
}
1398

    
1399
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1400
    int i,j;
1401
    for (i=0; i < height; i++) {
1402
      for (j=0; j < width; j++) {
1403
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1404
      }
1405
      src += stride;
1406
      dst += stride;
1407
    }
1408
}
1409

    
1410
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1411
    int i,j;
1412
    for (i=0; i < height; i++) {
1413
      for (j=0; j < width; j++) {
1414
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1415
      }
1416
      src += stride;
1417
      dst += stride;
1418
    }
1419
}
1420

    
1421
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1422
    int i,j;
1423
    for (i=0; i < height; i++) {
1424
      for (j=0; j < width; j++) {
1425
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1426
      }
1427
      src += stride;
1428
      dst += stride;
1429
    }
1430
}
1431

    
1432
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1433
    int i,j;
1434
    for (i=0; i < height; i++) {
1435
      for (j=0; j < width; j++) {
1436
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1437
      }
1438
      src += stride;
1439
      dst += stride;
1440
    }
1441
}
1442

    
1443
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1444
    int i,j;
1445
    for (i=0; i < height; i++) {
1446
      for (j=0; j < width; j++) {
1447
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1448
      }
1449
      src += stride;
1450
      dst += stride;
1451
    }
1452
}
1453

    
1454
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1455
    int i,j;
1456
    for (i=0; i < height; i++) {
1457
      for (j=0; j < width; j++) {
1458
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1459
      }
1460
      src += stride;
1461
      dst += stride;
1462
    }
1463
}
1464

    
1465
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1466
    int i,j;
1467
    for (i=0; i < height; i++) {
1468
      for (j=0; j < width; j++) {
1469
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1470
      }
1471
      src += stride;
1472
      dst += stride;
1473
    }
1474
}
1475
#if 0
1476
#define TPEL_WIDTH(width)\
1477
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1478
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1479
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1480
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1481
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1482
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1483
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1484
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1485
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1486
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1487
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1488
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1489
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1490
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1491
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1492
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1493
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1494
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1495
#endif
1496

    
1497
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
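/* QPEL_MC builds the MPEG-4 quarter-pel half-sample filters.  Written
 * out, the horizontal/vertical lowpass taps are
 * (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32, so the op_* macros
 * defined after the macro normalize with a >> 5.  Near the block edges
 * the samples that would fall outside the 9/17 available ones are
 * replaced by edge-mirrored in-block samples (note how src[8] is
 * reused in the last rows of the 8-tap loops). */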
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

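/* Two rounding behaviours are instantiated below: the plain put_/avg_
 * variants round to nearest ((b + 16) >> 5), while the _no_rnd_
 * variants use (b + 15) >> 5 and thus round halfway cases down, which
 * is what MPEG-4 selects per frame via its rounding-control flag. */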
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

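/* The (0,0) quarter-pel position needs no filtering at all, so the
 * mc00 cases simply alias the fixed-size copy/average helpers. */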
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c

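/* H264_LOWPASS generates the H.264 half-sample interpolators based on
 * the 6-tap filter (1, -5, 20, 20, -5, 1).  The _hv_ versions keep the
 * intermediate horizontal result in a 16-bit tmp buffer so that the
 * vertical pass filters unclipped values, as the standard requires. */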
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

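/* H264_MC expands the full set of sixteen quarter-pel motion
 * compensation functions for one block size.  In the _mcXY_ names, X
 * and Y are the quarter-sample offsets: mc20 is the horizontal
 * half-sample position, mc22 the centre, and the remaining positions
 * are formed by averaging two neighbouring integer/half-sample planes
 * with the pixels_l2() helpers. */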
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

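/* op_put/op_avg clip and normalize a single-pass filter result (the
 * 6-tap weights sum to 32, hence (b + 16) >> 5), while op2_put/op2_avg
 * handle the two-pass hv result, which is scaled by 32*32 = 1024,
 * hence (b + 512) >> 10. */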
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c

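/* The WMV2 "mspel" half-sample filter uses taps (-1, 9, 9, -1), which
 * sum to 16; the + 8 before the >> 4 rounds to nearest, and ff_cropTbl
 * clips the result back to the 0..255 range. */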
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}

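/* RV40 handles the (3,3) luma subpel position specially: rather than
 * its regular subpel filters it uses a plain diagonal half-sample
 * average, which is exactly what the xy2 copy/average helpers compute. */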
#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2628
    uint8_t halfH[88];
2629
    uint8_t halfV[64];
2630
    uint8_t halfHV[64];
2631
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2632
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2633
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2634
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2635
}
2636
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2637
    uint8_t halfH[88];
2638
    uint8_t halfV[64];
2639
    uint8_t halfHV[64];
2640
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2641
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2642
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2643
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2644
}
2645
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2646
    uint8_t halfH[88];
2647
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2648
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2649
}
2650

    
2651
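/* H.263 in-loop deblocking. d approximates the step across the block edge;
 * d1 is d passed through a dead-zone ramp: linear for |d| < strength,
 * ramping down to zero for strength <= |d| < 2*strength, and zero beyond,
 * so strong edges (likely real image content) are left untouched.
 * The "if(p&256) p= ~(p>>31);" idiom clips a 9-bit intermediate back to
 * 0..255: for p in roughly -256..511, bit 8 is set exactly for
 * out-of-range values, and ~(p>>31) is 0 for negative p and -1 (stored
 * as 255) for positive overflow. The if(CONFIG_...) guard lets the
 * compiler discard the body when no H.263 codec is built.
 */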
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

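/* Sum of absolute differences (SAD) between two blocks. The _x2, _y2 and
 * _xy2 variants compare pix1 against pix2 interpolated to the half-pel
 * position: avg2() is a rounding two-sample average and avg4() a rounding
 * average of the 2x2 neighbourhood, matching the half-pel motion
 * compensation the encoder is evaluating.
 */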
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

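/* Noise-preserving sum of squared differences: score1 is the plain SSE,
 * while score2 accumulates the difference in local 2x2 second derivatives
 * (a crude texture measure) between the two blocks. Losing texture is
 * penalized by adding |score2| scaled by avctx->nsse_weight (8 when no
 * context is available), so the metric prefers a noisy-but-similar
 * reconstruction over an over-smoothed one.
 */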
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

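/* Helpers apparently used by the encoder's quantizer noise shaping:
 * basis[] holds a transform basis function with BASIS_SHIFT fractional
 * bits, rem[] the current residual with RECON_SHIFT fractional bits.
 * try_8x8basis_c() returns the perceptually weighted energy of
 * rem + scale*basis (weight[] applied per coefficient) without modifying
 * rem; add_8x8basis_c() commits the same update in place.
 */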
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed up the permutation
 * @param scantable the scantable in use; this is only used to speed up the permutation, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

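/* Byte-wise add/subtract over a whole machine word at once (SWAR). With
 * pb_7f = 0x7f7f..7f and pb_80 = 0x8080..80, the low 7 bits of every byte
 * are summed carry-free by (a&pb_7f) + (b&pb_7f), and the top bit of each
 * byte is patched in with ((a^b)&pb_80), the carry-less sum of the sign
 * bits. Example on one byte: a=0x7f, b=0x01 gives (0x7f+0x01) ^ 0x00 =
 * 0x80, the correct mod-256 result with no carry into the next byte.
 */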
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

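/* HuffYUV median prediction: each pixel is predicted as
 *     pred = mid_pred(left, top, left + top - topleft)
 * i.e. the median of the left neighbour, the top neighbour and their
 * gradient estimate. The add_ variant reconstructs pixels from the stored
 * residue (src1 is the previous line), the sub_ variant produces the
 * residue for encoding.
 */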
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

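/* 8x8 Hadamard transform of the difference block (SATD). The first loop
 * runs all three butterfly stages across each row, the second loop runs
 * the first two column stages; BUTTERFLYA() folds the final column stage
 * into the accumulation, since |x+y| + |x-y| is all that is needed from
 * it.
 */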
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

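/* Rate-distortion metric: the block difference is transformed, quantized
 * and its entropy cost estimated from the codec's VLC length tables
 * (bits), then dequantized and inverse transformed to measure the real
 * distortion as SSE against the source. The return value is
 *     distortion + lambda*bits   with   lambda ~= qscale^2 * 109/128,
 * matching the (bits*qscale*qscale*109 + 64) >> 7 term below.
 */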
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

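/* Vertical SAD/SSE metrics: the intra variants sum |s[x] - s[x+stride]|
 * (resp. its square) over one block, i.e. its vertical gradient energy,
 * while vsad16/vsse16 apply the same measure to the difference of two
 * blocks. Useful e.g. as a cheap activity or field/frame measure.
 */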
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

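/* Windowed overlap of two half-buffers, as used for (i)MDCT overlap-add.
 * With the pointers advanced to the buffer centre, for i in [-len,0) and
 * j = -i-1:
 *     dst[i] = src0[i]*win[j] - src1[j]*win[i]
 *     dst[j] = src0[i]*win[i] + src1[j]*win[j]
 * so both halves of the output are produced in one pass over len sample
 * pairs.
 */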
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}

static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}

static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}

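/* Clipping floats through their IEEE-754 bit patterns, for the case where
 * min < 0 < max: reinterpreted as uint32_t, negative floats compare above
 * all positive ones and their order reverses within the negative range,
 * so "a > mini" catches values below min, and comparing with the sign bit
 * flipped ("(a^(1U<<31)) > maxisign") catches values above max. This keeps
 * the per-sample work on the integer units instead of float compares.
 */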
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

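/* Apply a symmetric Q15 window to a block of samples: each product
 * (sample * window) is rounded with +2^14 before the >>15 shift, and the
 * window symmetry is exploited by reusing window[i] for both input[i] and
 * input[len-i-1].
 */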
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

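/* W1..W7 above are round(2048*sqrt(2)*cos(k*pi/16)), the 8-point DCT
 * basis scaled by 2^11*sqrt(2), while W0 is plain 2048 = 2^11. The
 * constant 181 used below is round(256/sqrt(2)), so
 * (181*x + 128) >> 8 multiplies by 1/sqrt(2) in fixed point.
 */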
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed as soon as all IDCTs are
 converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

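/* ff_cropTbl is a clamping lookup table: MAX_NEG_CROP zeros, the identity
 * 0..255, then MAX_NEG_CROP entries of 255. Code indexes it through
 * cm = ff_cropTbl + MAX_NEG_CROP, so cm[x] == av_clip_uint8(x) for any x
 * in [-MAX_NEG_CROP, 255+MAX_NEG_CROP], trading a small table for the
 * compare-and-branch of an arithmetic clip.
 */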
/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

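    /* For lowres decoding the output is downscaled by 2^lowres, so the
     * 8x8 coefficient blocks are reduced with matching scaled IDCTs: a
     * 4x4 IDCT of the low-frequency quadrant for lowres==1, 2x2 for
     * lowres==2 and the DC-only 1x1 case for lowres==3. */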
    if(avctx->lowres==1){
4095
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4096
            c->idct_put= ff_jref_idct4_put;
4097
            c->idct_add= ff_jref_idct4_add;
4098
        }else{
4099
            c->idct_put= ff_h264_lowres_idct_put_c;
4100
            c->idct_add= ff_h264_lowres_idct_add_c;
4101
        }
4102
        c->idct    = j_rev_dct4;
4103
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4104
    }else if(avctx->lowres==2){
4105
        c->idct_put= ff_jref_idct2_put;
4106
        c->idct_add= ff_jref_idct2_add;
4107
        c->idct    = j_rev_dct2;
4108
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4109
    }else if(avctx->lowres==3){
4110
        c->idct_put= ff_jref_idct1_put;
4111
        c->idct_add= ff_jref_idct1_add;
4112
        c->idct    = j_rev_dct1;
4113
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4114
    }else{
4115
        if(avctx->idct_algo==FF_IDCT_INT){
4116
            c->idct_put= ff_jref_idct_put;
4117
            c->idct_add= ff_jref_idct_add;
4118
            c->idct    = j_rev_dct;
4119
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4120
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4121
                avctx->idct_algo==FF_IDCT_VP3){
4122
            c->idct_put= ff_vp3_idct_put_c;
4123
            c->idct_add= ff_vp3_idct_add_c;
4124
            c->idct    = ff_vp3_idct_c;
4125
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4126
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4127
            c->idct_put= ff_wmv2_idct_put_c;
4128
            c->idct_add= ff_wmv2_idct_add_c;
4129
            c->idct    = ff_wmv2_idct_c;
4130
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4131
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4132
            c->idct_put= ff_faanidct_put;
4133
            c->idct_add= ff_faanidct_add;
4134
            c->idct    = ff_faanidct;
4135
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4136
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4137
            c->idct_put= ff_ea_idct_put_c;
4138
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4139
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4140
            c->idct     = ff_bink_idct_c;
4141
            c->idct_add = ff_bink_idct_add_c;
4142
            c->idct_put = ff_bink_idct_put_c;
4143
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4144
        }else{ //accurate/default
4145
            c->idct_put= ff_simple_idct_put;
4146
            c->idct_add= ff_simple_idct_add;
4147
            c->idct    = ff_simple_idct;
4148
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4149
        }
4150
    }
4151

    
4152
    c->get_pixels = get_pixels_c;
4153
    c->diff_pixels = diff_pixels_c;
4154
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
4155
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4156
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4157
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
4158
    c->add_pixels8 = add_pixels8_c;
4159
    c->add_pixels4 = add_pixels4_c;
4160
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4161
    c->emulated_edge_mc = ff_emulated_edge_mc;
4162
    c->gmc1 = gmc1_c;
4163
    c->gmc = ff_gmc_c;
4164
    c->clear_block = clear_block_c;
4165
    c->clear_blocks = clear_blocks_c;
4166
    c->pix_sum = pix_sum_c;
4167
    c->pix_norm1 = pix_norm1_c;
4168

    
4169
    c->fill_block_tab[0] = fill_block16_c;
4170
    c->fill_block_tab[1] = fill_block8_c;
4171
    c->scale_block = scale_block_c;
4172

    
4173
    /* TODO [0] 16  [1] 8 */
4174
    c->pix_abs[0][0] = pix_abs16_c;
4175
    c->pix_abs[0][1] = pix_abs16_x2_c;
4176
    c->pix_abs[0][2] = pix_abs16_y2_c;
4177
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4178
    c->pix_abs[1][0] = pix_abs8_c;
4179
    c->pix_abs[1][1] = pix_abs8_x2_c;
4180
    c->pix_abs[1][2] = pix_abs8_y2_c;
4181
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4182

    
4183
#define dspfunc(PFX, IDX, NUM) \
4184
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4185
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4186
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4187
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4188

    
4189
    dspfunc(put, 0, 16);
4190
    dspfunc(put_no_rnd, 0, 16);
4191
    dspfunc(put, 1, 8);
4192
    dspfunc(put_no_rnd, 1, 8);
4193
    dspfunc(put, 2, 4);
4194
    dspfunc(put, 3, 2);
4195

    
4196
    dspfunc(avg, 0, 16);
4197
    dspfunc(avg_no_rnd, 0, 16);
4198
    dspfunc(avg, 1, 8);
4199
    dspfunc(avg_no_rnd, 1, 8);
4200
    dspfunc(avg, 2, 4);
4201
    dspfunc(avg, 3, 2);
4202
#undef dspfunc
4203

    
4204
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4205
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
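
    /* Quarter-pel motion compensation: the macro below fills all 16
     * fractional positions; the table index is 4*dy + dx with dx, dy
     * in 0..3. */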
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
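
    /* H.264 quarter-pel MC for 16x16, 8x8, 4x4 and 2x2 blocks; note that
     * no 2x2 averaging variant is installed here. */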
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
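    /* H.264 chroma MC is bilinear interpolation with 1/8-pel precision,
     * at 8x8, 4x4 and 2x2 block sizes. */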
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->draw_edges = draw_edges_c;
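
    /* Codec-specific DSP hooks below are compiled in only when the
     * corresponding decoders are enabled in the build configuration. */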
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
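
    /* mspel motion compensation table for 8x8 blocks, used by the WMV2
     * decoder. */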
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]