Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ fe2ff6d2

History | View | Annotate | Download (157 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavcore/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44
uint32_t ff_squareTbl[512] = {0, };
45

    
46
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
47
#define pb_7f (~0UL/255 * 0x7f)
48
#define pb_80 (~0UL/255 * 0x80)
49

    
50
const uint8_t ff_zigzag_direct[64] = {
51
    0,   1,  8, 16,  9,  2,  3, 10,
52
    17, 24, 32, 25, 18, 11,  4,  5,
53
    12, 19, 26, 33, 40, 48, 41, 34,
54
    27, 20, 13,  6,  7, 14, 21, 28,
55
    35, 42, 49, 56, 57, 50, 43, 36,
56
    29, 22, 15, 23, 30, 37, 44, 51,
57
    58, 59, 52, 45, 38, 31, 39, 46,
58
    53, 60, 61, 54, 47, 55, 62, 63
59
};
60

    
61
/* Specific zigzag scan for 248 idct. NOTE that unlike the
62
   specification, we interleave the fields */
63
const uint8_t ff_zigzag248_direct[64] = {
64
     0,  8,  1,  9, 16, 24,  2, 10,
65
    17, 25, 32, 40, 48, 56, 33, 41,
66
    18, 26,  3, 11,  4, 12, 19, 27,
67
    34, 42, 49, 57, 50, 58, 35, 43,
68
    20, 28,  5, 13,  6, 14, 21, 29,
69
    36, 44, 51, 59, 52, 60, 37, 45,
70
    22, 30,  7, 15, 23, 31, 38, 46,
71
    53, 61, 54, 62, 39, 47, 55, 63,
72
};
73

    
74
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
75
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
76

    
77
const uint8_t ff_alternate_horizontal_scan[64] = {
78
    0,  1,   2,  3,  8,  9, 16, 17,
79
    10, 11,  4,  5,  6,  7, 15, 14,
80
    13, 12, 19, 18, 24, 25, 32, 33,
81
    26, 27, 20, 21, 22, 23, 28, 29,
82
    30, 31, 34, 35, 40, 41, 48, 49,
83
    42, 43, 36, 37, 38, 39, 44, 45,
84
    46, 47, 50, 51, 56, 57, 58, 59,
85
    52, 53, 54, 55, 60, 61, 62, 63,
86
};
87

    
88
const uint8_t ff_alternate_vertical_scan[64] = {
89
    0,  8,  16, 24,  1,  9,  2, 10,
90
    17, 25, 32, 40, 48, 56, 57, 49,
91
    41, 33, 26, 18,  3, 11,  4, 12,
92
    19, 27, 34, 42, 50, 58, 35, 43,
93
    51, 59, 20, 28,  5, 13,  6, 14,
94
    21, 29, 36, 44, 52, 60, 37, 45,
95
    53, 61, 22, 30,  7, 15, 23, 31,
96
    38, 46, 54, 62, 39, 47, 55, 63,
97
};
98

    
99
/* Input permutation for the simple_idct_mmx */
100
static const uint8_t simple_mmx_permutation[64]={
101
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
109
};
110

    
111
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
112

    
113
/**
 * Initialize a ScanTable: apply the (IDCT-specific) coefficient permutation
 * to src_scantable, and precompute, for every scan position, the highest
 * permuted index encountered so far (raster_end).
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, max;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        const int idx = src_scantable[i];
        st->permutated[i] = permutation[idx];
#if ARCH_PPC
        st->inverse[idx] = i;
#endif
    }

    /* raster_end[i] = largest permuted index among scan positions 0..i */
    max = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > max)
            max = st->permutated[i];
        st->raster_end[i] = max;
    }
}
136

    
137
/* Sum of all 256 samples of a 16x16 block; line_size is the row stride. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
158

    
159
static int pix_norm1_c(uint8_t * pix, int line_size)
160
{
161
    int s, i, j;
162
    uint32_t *sq = ff_squareTbl + 256;
163

    
164
    s = 0;
165
    for (i = 0; i < 16; i++) {
166
        for (j = 0; j < 16; j += 8) {
167
#if 0
168
            s += sq[pix[0]];
169
            s += sq[pix[1]];
170
            s += sq[pix[2]];
171
            s += sq[pix[3]];
172
            s += sq[pix[4]];
173
            s += sq[pix[5]];
174
            s += sq[pix[6]];
175
            s += sq[pix[7]];
176
#else
177
#if LONG_MAX > 2147483647
178
            register uint64_t x=*(uint64_t*)pix;
179
            s += sq[x&0xff];
180
            s += sq[(x>>8)&0xff];
181
            s += sq[(x>>16)&0xff];
182
            s += sq[(x>>24)&0xff];
183
            s += sq[(x>>32)&0xff];
184
            s += sq[(x>>40)&0xff];
185
            s += sq[(x>>48)&0xff];
186
            s += sq[(x>>56)&0xff];
187
#else
188
            register uint32_t x=*(uint32_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            x=*(uint32_t*)(pix+4);
194
            s += sq[x&0xff];
195
            s += sq[(x>>8)&0xff];
196
            s += sq[(x>>16)&0xff];
197
            s += sq[(x>>24)&0xff];
198
#endif
199
#endif
200
            pix += 8;
201
        }
202
        pix += line_size - 16;
203
    }
204
    return s;
205
}
206

    
207
/* Byte-swap w 32-bit words from src into dst (dst may alias src). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
224

    
225
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226
{
227
    int s, i;
228
    uint32_t *sq = ff_squareTbl + 256;
229

    
230
    s = 0;
231
    for (i = 0; i < h; i++) {
232
        s += sq[pix1[0] - pix2[0]];
233
        s += sq[pix1[1] - pix2[1]];
234
        s += sq[pix1[2] - pix2[2]];
235
        s += sq[pix1[3] - pix2[3]];
236
        pix1 += line_size;
237
        pix2 += line_size;
238
    }
239
    return s;
240
}
241

    
242
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243
{
244
    int s, i;
245
    uint32_t *sq = ff_squareTbl + 256;
246

    
247
    s = 0;
248
    for (i = 0; i < h; i++) {
249
        s += sq[pix1[0] - pix2[0]];
250
        s += sq[pix1[1] - pix2[1]];
251
        s += sq[pix1[2] - pix2[2]];
252
        s += sq[pix1[3] - pix2[3]];
253
        s += sq[pix1[4] - pix2[4]];
254
        s += sq[pix1[5] - pix2[5]];
255
        s += sq[pix1[6] - pix2[6]];
256
        s += sq[pix1[7] - pix2[7]];
257
        pix1 += line_size;
258
        pix2 += line_size;
259
    }
260
    return s;
261
}
262

    
263
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264
{
265
    int s, i;
266
    uint32_t *sq = ff_squareTbl + 256;
267

    
268
    s = 0;
269
    for (i = 0; i < h; i++) {
270
        s += sq[pix1[ 0] - pix2[ 0]];
271
        s += sq[pix1[ 1] - pix2[ 1]];
272
        s += sq[pix1[ 2] - pix2[ 2]];
273
        s += sq[pix1[ 3] - pix2[ 3]];
274
        s += sq[pix1[ 4] - pix2[ 4]];
275
        s += sq[pix1[ 5] - pix2[ 5]];
276
        s += sq[pix1[ 6] - pix2[ 6]];
277
        s += sq[pix1[ 7] - pix2[ 7]];
278
        s += sq[pix1[ 8] - pix2[ 8]];
279
        s += sq[pix1[ 9] - pix2[ 9]];
280
        s += sq[pix1[10] - pix2[10]];
281
        s += sq[pix1[11] - pix2[11]];
282
        s += sq[pix1[12] - pix2[12]];
283
        s += sq[pix1[13] - pix2[13]];
284
        s += sq[pix1[14] - pix2[14]];
285
        s += sq[pix1[15] - pix2[15]];
286

    
287
        pix1 += line_size;
288
        pix2 += line_size;
289
    }
290
    return s;
291
}
292

    
293
/* Replicate the outermost samples of a width x height image into a border
 * of w extra pixels on every side; 'wrap' is the line stride in bytes.
 * (The original comments labeled the two bottom corners "top left"/"top
 * right" — fixed below.) */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *row, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* replicate the first/last row above/below the image */
    for (i = 1; i <= w; i++) {
        memcpy(buf       - i * wrap, buf,       width);
        memcpy(last_line + i * wrap, last_line, width);
    }

    /* replicate the first/last column to the left/right */
    row = buf;
    for (i = 0; i < height; i++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* fill the four corner areas from the image's corner samples */
    for (i = 1; i <= w; i++) {
        memset(buf       - i * wrap - w,     buf[0],               w); /* top left     */
        memset(buf       - i * wrap + width, buf[width - 1],       w); /* top right    */
        memset(last_line + i * wrap - w,     last_line[0],         w); /* bottom left  */
        memset(last_line + i * wrap + width, last_line[width - 1], w); /* bottom right */
    }
}
321

    
322
/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp the requested origin so at least one sample of the block overlaps
     * the source; move src so it still points at the (clamped) origin. */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src  += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }

    /* Sub-rectangle of the block actually covered by real source samples. */
    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    w    = end_x - start_x;               /* width of the copied region */
    src += start_y * linesize + start_x;
    buf += start_x;

    /* rows above the source: replicate the first available row */
    for (y = 0; y < start_y; y++) {
        memcpy(buf, src, w);
        buf += linesize;
    }

    /* rows covered by the source: straight copy */
    for (; y < end_y; y++) {
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    /* rows below the source: replicate the last available row */
    src -= linesize;
    for (; y < block_h; y++) {
        memcpy(buf, src, w);
        buf += linesize;
    }

    /* finally replicate the leftmost/rightmost copied column sideways */
    buf -= block_h * linesize + start_x;
    while (block_h--) {
        for (x = 0; x < start_x; x++)
            buf[x] = buf[start_x];
        for (x = end_x; x < block_w; x++)
            buf[x] = buf[end_x - 1];
        buf += linesize;
    }
}
399

    
400
/* Load an 8x8 block of unsigned samples into a DCT coefficient block. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[j] = pixels[j];
        pixels += line_size;
        block  += 8;
    }
}
418

    
419
/* Store the element-wise difference of two 8x8 sample blocks. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
438

    
439

    
440
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
441
                                 int line_size)
442
{
443
    int i;
444
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445

    
446
    /* read the pixels */
447
    for(i=0;i<8;i++) {
448
        pixels[0] = cm[block[0]];
449
        pixels[1] = cm[block[1]];
450
        pixels[2] = cm[block[2]];
451
        pixels[3] = cm[block[3]];
452
        pixels[4] = cm[block[4]];
453
        pixels[5] = cm[block[5]];
454
        pixels[6] = cm[block[6]];
455
        pixels[7] = cm[block[7]];
456

    
457
        pixels += line_size;
458
        block += 8;
459
    }
460
}
461

    
462
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
463
                                 int line_size)
464
{
465
    int i;
466
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
467

    
468
    /* read the pixels */
469
    for(i=0;i<4;i++) {
470
        pixels[0] = cm[block[0]];
471
        pixels[1] = cm[block[1]];
472
        pixels[2] = cm[block[2]];
473
        pixels[3] = cm[block[3]];
474

    
475
        pixels += line_size;
476
        block += 8;
477
    }
478
}
479

    
480
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
481
                                 int line_size)
482
{
483
    int i;
484
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485

    
486
    /* read the pixels */
487
    for(i=0;i<2;i++) {
488
        pixels[0] = cm[block[0]];
489
        pixels[1] = cm[block[1]];
490

    
491
        pixels += line_size;
492
        block += 8;
493
    }
494
}
495

    
496
/* Store an 8x8 block of signed values, biasing by +128 and saturating the
 * result to the 0..255 range. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            const DCTELEM v = block[j];
            pixels[j] = v < -128 ? 0 : v > 127 ? 255 : (uint8_t)(v + 128);
        }
        block  += 8;
        pixels += line_size;
    }
}
516

    
517
/* Store an 8x8 coefficient block as samples without any clamping
 * (values are truncated by the uint8_t assignment). */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = block[j];
        pixels += line_size;
        block  += 8;
    }
}
537

    
538
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
539
                          int line_size)
540
{
541
    int i;
542
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
543

    
544
    /* read the pixels */
545
    for(i=0;i<8;i++) {
546
        pixels[0] = cm[pixels[0] + block[0]];
547
        pixels[1] = cm[pixels[1] + block[1]];
548
        pixels[2] = cm[pixels[2] + block[2]];
549
        pixels[3] = cm[pixels[3] + block[3]];
550
        pixels[4] = cm[pixels[4] + block[4]];
551
        pixels[5] = cm[pixels[5] + block[5]];
552
        pixels[6] = cm[pixels[6] + block[6]];
553
        pixels[7] = cm[pixels[7] + block[7]];
554
        pixels += line_size;
555
        block += 8;
556
    }
557
}
558

    
559
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
560
                          int line_size)
561
{
562
    int i;
563
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
564

    
565
    /* read the pixels */
566
    for(i=0;i<4;i++) {
567
        pixels[0] = cm[pixels[0] + block[0]];
568
        pixels[1] = cm[pixels[1] + block[1]];
569
        pixels[2] = cm[pixels[2] + block[2]];
570
        pixels[3] = cm[pixels[3] + block[3]];
571
        pixels += line_size;
572
        block += 8;
573
    }
574
}
575

    
576
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
577
                          int line_size)
578
{
579
    int i;
580
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
581

    
582
    /* read the pixels */
583
    for(i=0;i<2;i++) {
584
        pixels[0] = cm[pixels[0] + block[0]];
585
        pixels[1] = cm[pixels[1] + block[1]];
586
        pixels += line_size;
587
        block += 8;
588
    }
589
}
590

    
591
/* Add an 8x8 coefficient block to the samples without clamping. */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] += block[j];
        pixels += line_size;
        block  += 8;
    }
}
607

    
608
/* Add a 4x4 coefficient block to the samples without clamping.
 * NOTE: unlike the 4x4 clamped variants, block rows here are 4 apart. */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i, j;

    for (i = 0; i < 4; i++) {
        for (j = 0; j < 4; j++)
            pixels[j] += block[j];
        pixels += line_size;
        block  += 4;
    }
}
620

    
621
/* Sum of absolute values of all 64 coefficients of a block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int i, sum = 0;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);

    return sum;
}
628

    
629
/* Fill a 16-wide block of height h with a constant sample value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h--) {
        memset(block, value, 16);
        block += line_size;
    }
}
638

    
639
/* Fill an 8-wide block of height h with a constant sample value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h--) {
        memset(block, value, 8);
        block += line_size;
    }
}
648

    
649
/* 2x upscale of an 8x8 block: every source sample is replicated into a 2x2
 * square in dst (byte writes instead of the original uint16_t punning — the
 * two bytes of each 16-bit store were identical, so the result is the same). */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            const uint8_t v = src[i];
            dst[2 * i]                = v;
            dst[2 * i + 1]            = v;
            dst[linesize + 2 * i]     = v;
            dst[linesize + 2 * i + 1] = v;
        }
        src += 8;
        dst += 2 * linesize;   /* each source row produces two dst rows */
    }
}
664

    
665
#if 0
666

667
#define PIXOP2(OPNAME, OP) \
668
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
669
{\
670
    int i;\
671
    for(i=0; i<h; i++){\
672
        OP(*((uint64_t*)block), AV_RN64(pixels));\
673
        pixels+=line_size;\
674
        block +=line_size;\
675
    }\
676
}\
677
\
678
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
679
{\
680
    int i;\
681
    for(i=0; i<h; i++){\
682
        const uint64_t a= AV_RN64(pixels  );\
683
        const uint64_t b= AV_RN64(pixels+1);\
684
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
685
        pixels+=line_size;\
686
        block +=line_size;\
687
    }\
688
}\
689
\
690
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
691
{\
692
    int i;\
693
    for(i=0; i<h; i++){\
694
        const uint64_t a= AV_RN64(pixels  );\
695
        const uint64_t b= AV_RN64(pixels+1);\
696
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
697
        pixels+=line_size;\
698
        block +=line_size;\
699
    }\
700
}\
701
\
702
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
703
{\
704
    int i;\
705
    for(i=0; i<h; i++){\
706
        const uint64_t a= AV_RN64(pixels          );\
707
        const uint64_t b= AV_RN64(pixels+line_size);\
708
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
709
        pixels+=line_size;\
710
        block +=line_size;\
711
    }\
712
}\
713
\
714
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
715
{\
716
    int i;\
717
    for(i=0; i<h; i++){\
718
        const uint64_t a= AV_RN64(pixels          );\
719
        const uint64_t b= AV_RN64(pixels+line_size);\
720
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
721
        pixels+=line_size;\
722
        block +=line_size;\
723
    }\
724
}\
725
\
726
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
727
{\
728
        int i;\
729
        const uint64_t a= AV_RN64(pixels  );\
730
        const uint64_t b= AV_RN64(pixels+1);\
731
        uint64_t l0=  (a&0x0303030303030303ULL)\
732
                    + (b&0x0303030303030303ULL)\
733
                    + 0x0202020202020202ULL;\
734
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
735
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
736
        uint64_t l1,h1;\
737
\
738
        pixels+=line_size;\
739
        for(i=0; i<h; i+=2){\
740
            uint64_t a= AV_RN64(pixels  );\
741
            uint64_t b= AV_RN64(pixels+1);\
742
            l1=  (a&0x0303030303030303ULL)\
743
               + (b&0x0303030303030303ULL);\
744
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
745
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
746
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
747
            pixels+=line_size;\
748
            block +=line_size;\
749
            a= AV_RN64(pixels  );\
750
            b= AV_RN64(pixels+1);\
751
            l0=  (a&0x0303030303030303ULL)\
752
               + (b&0x0303030303030303ULL)\
753
               + 0x0202020202020202ULL;\
754
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
755
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
756
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
757
            pixels+=line_size;\
758
            block +=line_size;\
759
        }\
760
}\
761
\
762
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
763
{\
764
        int i;\
765
        const uint64_t a= AV_RN64(pixels  );\
766
        const uint64_t b= AV_RN64(pixels+1);\
767
        uint64_t l0=  (a&0x0303030303030303ULL)\
768
                    + (b&0x0303030303030303ULL)\
769
                    + 0x0101010101010101ULL;\
770
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
771
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
772
        uint64_t l1,h1;\
773
\
774
        pixels+=line_size;\
775
        for(i=0; i<h; i+=2){\
776
            uint64_t a= AV_RN64(pixels  );\
777
            uint64_t b= AV_RN64(pixels+1);\
778
            l1=  (a&0x0303030303030303ULL)\
779
               + (b&0x0303030303030303ULL);\
780
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
781
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
782
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
783
            pixels+=line_size;\
784
            block +=line_size;\
785
            a= AV_RN64(pixels  );\
786
            b= AV_RN64(pixels+1);\
787
            l0=  (a&0x0303030303030303ULL)\
788
               + (b&0x0303030303030303ULL)\
789
               + 0x0101010101010101ULL;\
790
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
791
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
792
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
793
            pixels+=line_size;\
794
            block +=line_size;\
795
        }\
796
}\
797
\
798
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
799
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
800
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
801
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
802
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
803
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
804
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
805

806
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
807
#else // 64 bit variant
808

    
809
#define PIXOP2(OPNAME, OP) \
810
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
811
    int i;\
812
    for(i=0; i<h; i++){\
813
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
814
        pixels+=line_size;\
815
        block +=line_size;\
816
    }\
817
}\
818
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
819
    int i;\
820
    for(i=0; i<h; i++){\
821
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
822
        pixels+=line_size;\
823
        block +=line_size;\
824
    }\
825
}\
826
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
827
    int i;\
828
    for(i=0; i<h; i++){\
829
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
830
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
831
        pixels+=line_size;\
832
        block +=line_size;\
833
    }\
834
}\
835
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
837
}\
838
\
839
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
840
                                                int src_stride1, int src_stride2, int h){\
841
    int i;\
842
    for(i=0; i<h; i++){\
843
        uint32_t a,b;\
844
        a= AV_RN32(&src1[i*src_stride1  ]);\
845
        b= AV_RN32(&src2[i*src_stride2  ]);\
846
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
847
        a= AV_RN32(&src1[i*src_stride1+4]);\
848
        b= AV_RN32(&src2[i*src_stride2+4]);\
849
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
850
    }\
851
}\
852
\
853
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
854
                                                int src_stride1, int src_stride2, int h){\
855
    int i;\
856
    for(i=0; i<h; i++){\
857
        uint32_t a,b;\
858
        a= AV_RN32(&src1[i*src_stride1  ]);\
859
        b= AV_RN32(&src2[i*src_stride2  ]);\
860
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
861
        a= AV_RN32(&src1[i*src_stride1+4]);\
862
        b= AV_RN32(&src2[i*src_stride2+4]);\
863
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
864
    }\
865
}\
866
\
867
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
868
                                                int src_stride1, int src_stride2, int h){\
869
    int i;\
870
    for(i=0; i<h; i++){\
871
        uint32_t a,b;\
872
        a= AV_RN32(&src1[i*src_stride1  ]);\
873
        b= AV_RN32(&src2[i*src_stride2  ]);\
874
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
875
    }\
876
}\
877
\
878
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
879
                                                int src_stride1, int src_stride2, int h){\
880
    int i;\
881
    for(i=0; i<h; i++){\
882
        uint32_t a,b;\
883
        a= AV_RN16(&src1[i*src_stride1  ]);\
884
        b= AV_RN16(&src2[i*src_stride2  ]);\
885
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
886
    }\
887
}\
888
\
889
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890
                                                int src_stride1, int src_stride2, int h){\
891
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
892
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
893
}\
894
\
895
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
896
                                                int src_stride1, int src_stride2, int h){\
897
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
898
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
899
}\
900
\
901
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
902
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
903
}\
904
\
905
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
906
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
907
}\
908
\
909
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
910
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911
}\
912
\
913
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
914
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
915
}\
916
\
917
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
918
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
919
    int i;\
920
    for(i=0; i<h; i++){\
921
        uint32_t a, b, c, d, l0, l1, h0, h1;\
922
        a= AV_RN32(&src1[i*src_stride1]);\
923
        b= AV_RN32(&src2[i*src_stride2]);\
924
        c= AV_RN32(&src3[i*src_stride3]);\
925
        d= AV_RN32(&src4[i*src_stride4]);\
926
        l0=  (a&0x03030303UL)\
927
           + (b&0x03030303UL)\
928
           + 0x02020202UL;\
929
        h0= ((a&0xFCFCFCFCUL)>>2)\
930
          + ((b&0xFCFCFCFCUL)>>2);\
931
        l1=  (c&0x03030303UL)\
932
           + (d&0x03030303UL);\
933
        h1= ((c&0xFCFCFCFCUL)>>2)\
934
          + ((d&0xFCFCFCFCUL)>>2);\
935
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
936
        a= AV_RN32(&src1[i*src_stride1+4]);\
937
        b= AV_RN32(&src2[i*src_stride2+4]);\
938
        c= AV_RN32(&src3[i*src_stride3+4]);\
939
        d= AV_RN32(&src4[i*src_stride4+4]);\
940
        l0=  (a&0x03030303UL)\
941
           + (b&0x03030303UL)\
942
           + 0x02020202UL;\
943
        h0= ((a&0xFCFCFCFCUL)>>2)\
944
          + ((b&0xFCFCFCFCUL)>>2);\
945
        l1=  (c&0x03030303UL)\
946
           + (d&0x03030303UL);\
947
        h1= ((c&0xFCFCFCFCUL)>>2)\
948
          + ((d&0xFCFCFCFCUL)>>2);\
949
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
950
    }\
951
}\
952
\
953
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
954
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
955
}\
956
\
957
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
958
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
959
}\
960
\
961
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
962
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
963
}\
964
\
965
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
966
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
967
}\
968
\
969
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
970
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
971
    int i;\
972
    for(i=0; i<h; i++){\
973
        uint32_t a, b, c, d, l0, l1, h0, h1;\
974
        a= AV_RN32(&src1[i*src_stride1]);\
975
        b= AV_RN32(&src2[i*src_stride2]);\
976
        c= AV_RN32(&src3[i*src_stride3]);\
977
        d= AV_RN32(&src4[i*src_stride4]);\
978
        l0=  (a&0x03030303UL)\
979
           + (b&0x03030303UL)\
980
           + 0x01010101UL;\
981
        h0= ((a&0xFCFCFCFCUL)>>2)\
982
          + ((b&0xFCFCFCFCUL)>>2);\
983
        l1=  (c&0x03030303UL)\
984
           + (d&0x03030303UL);\
985
        h1= ((c&0xFCFCFCFCUL)>>2)\
986
          + ((d&0xFCFCFCFCUL)>>2);\
987
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
988
        a= AV_RN32(&src1[i*src_stride1+4]);\
989
        b= AV_RN32(&src2[i*src_stride2+4]);\
990
        c= AV_RN32(&src3[i*src_stride3+4]);\
991
        d= AV_RN32(&src4[i*src_stride4+4]);\
992
        l0=  (a&0x03030303UL)\
993
           + (b&0x03030303UL)\
994
           + 0x01010101UL;\
995
        h0= ((a&0xFCFCFCFCUL)>>2)\
996
          + ((b&0xFCFCFCFCUL)>>2);\
997
        l1=  (c&0x03030303UL)\
998
           + (d&0x03030303UL);\
999
        h1= ((c&0xFCFCFCFCUL)>>2)\
1000
          + ((d&0xFCFCFCFCUL)>>2);\
1001
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1002
    }\
1003
}\
1004
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1005
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1006
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1008
}\
1009
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1010
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1011
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1012
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1013
}\
1014
\
1015
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1016
{\
1017
        int i, a0, b0, a1, b1;\
1018
        a0= pixels[0];\
1019
        b0= pixels[1] + 2;\
1020
        a0 += b0;\
1021
        b0 += pixels[2];\
1022
\
1023
        pixels+=line_size;\
1024
        for(i=0; i<h; i+=2){\
1025
            a1= pixels[0];\
1026
            b1= pixels[1];\
1027
            a1 += b1;\
1028
            b1 += pixels[2];\
1029
\
1030
            block[0]= (a1+a0)>>2; /* FIXME non put */\
1031
            block[1]= (b1+b0)>>2;\
1032
\
1033
            pixels+=line_size;\
1034
            block +=line_size;\
1035
\
1036
            a0= pixels[0];\
1037
            b0= pixels[1] + 2;\
1038
            a0 += b0;\
1039
            b0 += pixels[2];\
1040
\
1041
            block[0]= (a1+a0)>>2;\
1042
            block[1]= (b1+b0)>>2;\
1043
            pixels+=line_size;\
1044
            block +=line_size;\
1045
        }\
1046
}\
1047
\
1048
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1049
{\
1050
        int i;\
1051
        const uint32_t a= AV_RN32(pixels  );\
1052
        const uint32_t b= AV_RN32(pixels+1);\
1053
        uint32_t l0=  (a&0x03030303UL)\
1054
                    + (b&0x03030303UL)\
1055
                    + 0x02020202UL;\
1056
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1057
                   + ((b&0xFCFCFCFCUL)>>2);\
1058
        uint32_t l1,h1;\
1059
\
1060
        pixels+=line_size;\
1061
        for(i=0; i<h; i+=2){\
1062
            uint32_t a= AV_RN32(pixels  );\
1063
            uint32_t b= AV_RN32(pixels+1);\
1064
            l1=  (a&0x03030303UL)\
1065
               + (b&0x03030303UL);\
1066
            h1= ((a&0xFCFCFCFCUL)>>2)\
1067
              + ((b&0xFCFCFCFCUL)>>2);\
1068
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1069
            pixels+=line_size;\
1070
            block +=line_size;\
1071
            a= AV_RN32(pixels  );\
1072
            b= AV_RN32(pixels+1);\
1073
            l0=  (a&0x03030303UL)\
1074
               + (b&0x03030303UL)\
1075
               + 0x02020202UL;\
1076
            h0= ((a&0xFCFCFCFCUL)>>2)\
1077
              + ((b&0xFCFCFCFCUL)>>2);\
1078
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1079
            pixels+=line_size;\
1080
            block +=line_size;\
1081
        }\
1082
}\
1083
\
1084
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1085
{\
1086
    int j;\
1087
    for(j=0; j<2; j++){\
1088
        int i;\
1089
        const uint32_t a= AV_RN32(pixels  );\
1090
        const uint32_t b= AV_RN32(pixels+1);\
1091
        uint32_t l0=  (a&0x03030303UL)\
1092
                    + (b&0x03030303UL)\
1093
                    + 0x02020202UL;\
1094
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1095
                   + ((b&0xFCFCFCFCUL)>>2);\
1096
        uint32_t l1,h1;\
1097
\
1098
        pixels+=line_size;\
1099
        for(i=0; i<h; i+=2){\
1100
            uint32_t a= AV_RN32(pixels  );\
1101
            uint32_t b= AV_RN32(pixels+1);\
1102
            l1=  (a&0x03030303UL)\
1103
               + (b&0x03030303UL);\
1104
            h1= ((a&0xFCFCFCFCUL)>>2)\
1105
              + ((b&0xFCFCFCFCUL)>>2);\
1106
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1107
            pixels+=line_size;\
1108
            block +=line_size;\
1109
            a= AV_RN32(pixels  );\
1110
            b= AV_RN32(pixels+1);\
1111
            l0=  (a&0x03030303UL)\
1112
               + (b&0x03030303UL)\
1113
               + 0x02020202UL;\
1114
            h0= ((a&0xFCFCFCFCUL)>>2)\
1115
              + ((b&0xFCFCFCFCUL)>>2);\
1116
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1117
            pixels+=line_size;\
1118
            block +=line_size;\
1119
        }\
1120
        pixels+=4-line_size*(h+1);\
1121
        block +=4-line_size*h;\
1122
    }\
1123
}\
1124
\
1125
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1126
{\
1127
    int j;\
1128
    for(j=0; j<2; j++){\
1129
        int i;\
1130
        const uint32_t a= AV_RN32(pixels  );\
1131
        const uint32_t b= AV_RN32(pixels+1);\
1132
        uint32_t l0=  (a&0x03030303UL)\
1133
                    + (b&0x03030303UL)\
1134
                    + 0x01010101UL;\
1135
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1136
                   + ((b&0xFCFCFCFCUL)>>2);\
1137
        uint32_t l1,h1;\
1138
\
1139
        pixels+=line_size;\
1140
        for(i=0; i<h; i+=2){\
1141
            uint32_t a= AV_RN32(pixels  );\
1142
            uint32_t b= AV_RN32(pixels+1);\
1143
            l1=  (a&0x03030303UL)\
1144
               + (b&0x03030303UL);\
1145
            h1= ((a&0xFCFCFCFCUL)>>2)\
1146
              + ((b&0xFCFCFCFCUL)>>2);\
1147
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1148
            pixels+=line_size;\
1149
            block +=line_size;\
1150
            a= AV_RN32(pixels  );\
1151
            b= AV_RN32(pixels+1);\
1152
            l0=  (a&0x03030303UL)\
1153
               + (b&0x03030303UL)\
1154
               + 0x01010101UL;\
1155
            h0= ((a&0xFCFCFCFCUL)>>2)\
1156
              + ((b&0xFCFCFCFCUL)>>2);\
1157
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1158
            pixels+=line_size;\
1159
            block +=line_size;\
1160
        }\
1161
        pixels+=4-line_size*(h+1);\
1162
        block +=4-line_size*h;\
1163
    }\
1164
}\
1165
\
1166
/* build the 16-wide variants from the 8-wide ones (end of PIXOP2 body) */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

#define op_avg(a, b) a = rnd_avg32(a, b)
1176
#endif
1177
#define op_put(a, b) a = b
1178

    
1179
PIXOP2(avg, op_avg)
1180
PIXOP2(put, op_put)
1181
#undef op_avg
1182
#undef op_put
1183

    
1184
/* put is unaffected by rounding mode, so the no_rnd names alias it */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

/* rounded 2- and 4-sample averages */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

/* uniform-stride wrapper around the generated no-round 16-wide l2 average */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

/* uniform-stride wrapper around the generated no-round 8-wide l2 average */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

/**
 * 1/16-pel bilinear copy of an 8-pixel-wide strip (single-vector GMC).
 * The four weights A..D derive from the fractional position (x16,y16) and
 * always sum to 256, hence the >>8 normalisation; rounder is added before
 * the shift.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

/**
 * Global motion compensation with a full affine transform.
 * (vx,vy) accumulate dxx/dyx per output column and (ox,oy) accumulate
 * dxy/dyy per output row; each source coordinate is split into integer and
 * fractional parts for bilinear interpolation with rounding term r.
 * Coordinates falling outside [0,width]x[0,height] are clipped to the edge
 * (degrading to 1-D or nearest-sample interpolation there).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* vertically clipped: horizontal-only interpolation */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally clipped: vertical-only interpolation */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clipped in both directions: nearest edge sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

/* full-pel copy: dispatch on block width to the plain pixel copy helpers */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

/* thirdpel horizontal interpolation, weights 2:1 on src[j]/src[j+1];
   683/2048 approximates 1/3 with rounding */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel horizontal interpolation, weights 1:2 on src[j]/src[j+1] */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel vertical interpolation, weights 2:1 on src[j]/src[j+stride] */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal interpolation (1/3,1/3), weights 4:3:3:2 over the 2x2
   neighbourhood; 2731/32768 approximates 1/12 with rounding */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal interpolation (1/3,2/3), weights 3:2:4:3 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel vertical interpolation, weights 1:2 on src[j]/src[j+stride] */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal interpolation (2/3,1/3), weights 3:4:2:3 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal interpolation (2/3,2/3), weights 2:3:3:4 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

/* full-pel averaging copy: dispatch on block width to the avg helpers */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

/* thirdpel horizontal (2:1) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel horizontal (1:2) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel vertical (2:1) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal (1/3,1/3) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal (1/3,2/3) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel vertical (1:2) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal (2/3,1/3) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

/* thirdpel diagonal (2/3,2/3) interpolation, rounded-averaged into dst */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
/* dead code, kept as-is: fixed-width thirdpel wrappers (note the bogus
   "void" before each call — never compiled because of the #if 0) */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

/* 2-D bilinear chroma motion compensation for 2/4/8-wide blocks.
 * Weights A..D come from the 1/8-pel position (x,y) and sum to 64; when
 * D==0 the interpolation degenerates to a 1-D pass with weight E along
 * either the row (step 1) or the column (step stride). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1596
#define op_put(a, b) a = (((b) + 32)>>6)
1597

    
1598
H264_CHROMA_MC(put_       , op_put)
1599
H264_CHROMA_MC(avg_       , op_avg)
1600
#undef op_avg
1601
#undef op_put
1602

    
1603
/**
 * VC-1 no-rounding 8-wide bilinear chroma MC.
 * Same A..D weighting as the H.264 chroma MC, but normalised with the
 * VC-1 rounding constant 32-4=28 before the >>6.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

/**
 * VC-1 no-rounding 8-wide bilinear chroma MC, averaged into dst with avg2.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

/*
 * QPEL_MC(r, OPNAME, RND, OP) generates the full family of MPEG-4
 * quarter-pel motion compensation functions for 8x8 and 16x16 blocks:
 *
 *  - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: the 8-tap half-pel filter
 *    (coefficients 20, -6, 3, -1 on each side; coefficient sum 32, so OP
 *    normalizes with ">> 5").  The edge taps are mirrored inside the row
 *    (note the repeated src[8]/src[16] indices near the block border), so
 *    the horizontal filters never read past one extra pixel.
 *  - OPNAME##qpel{8,16}_mcXY_c for each quarter-pel position (X,Y) in
 *    1/4-pel units: built by combining h/v lowpass passes and the
 *    pixels*_l2/_l4 averaging helpers.  copy_block{9,17} first copies the
 *    source (plus the one extra row/column the filters need) into a local
 *    buffer so edge positions can be addressed safely.
 *  - ff_##OPNAME##qpel*_old_c: the older, bit-different diagonal
 *    interpolation kept for reference/compatibility.
 *
 * OP is one of op_put/op_put_no_rnd/op_avg (defined below the macro) and
 * selects rounding and put-vs-average; RND selects which "put" flavor the
 * intermediate passes use.  'cm' is the clipping table used by OP.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2128
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2129
#define op_put(a, b) a = cm[((b) + 16)>>5]
2130
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2131

    
2132
QPEL_MC(0, put_       , _       , op_put)
2133
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2134
QPEL_MC(0, avg_       , _       , op_avg)
2135
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2136
#undef op_avg
2137
#undef op_avg_no_rnd
2138
#undef op_put
2139
#undef op_put_no_rnd
2140

    
2141
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
2142
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2143
#define put_qpel16_mc00_c ff_put_pixels16x16_c
2144
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2145
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2146
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#if 1
2149
#define H264_LOWPASS(OPNAME, OP, OP2) \
2150
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2151
    const int h=2;\
2152
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2153
    int i;\
2154
    for(i=0; i<h; i++)\
2155
    {\
2156
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2157
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2158
        dst+=dstStride;\
2159
        src+=srcStride;\
2160
    }\
2161
}\
2162
\
2163
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2164
    const int w=2;\
2165
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2166
    int i;\
2167
    for(i=0; i<w; i++)\
2168
    {\
2169
        const int srcB= src[-2*srcStride];\
2170
        const int srcA= src[-1*srcStride];\
2171
        const int src0= src[0 *srcStride];\
2172
        const int src1= src[1 *srcStride];\
2173
        const int src2= src[2 *srcStride];\
2174
        const int src3= src[3 *srcStride];\
2175
        const int src4= src[4 *srcStride];\
2176
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2177
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2178
        dst++;\
2179
        src++;\
2180
    }\
2181
}\
2182
\
2183
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2184
    const int h=2;\
2185
    const int w=2;\
2186
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2187
    int i;\
2188
    src -= 2*srcStride;\
2189
    for(i=0; i<h+5; i++)\
2190
    {\
2191
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2192
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2193
        tmp+=tmpStride;\
2194
        src+=srcStride;\
2195
    }\
2196
    tmp -= tmpStride*(h+5-2);\
2197
    for(i=0; i<w; i++)\
2198
    {\
2199
        const int tmpB= tmp[-2*tmpStride];\
2200
        const int tmpA= tmp[-1*tmpStride];\
2201
        const int tmp0= tmp[0 *tmpStride];\
2202
        const int tmp1= tmp[1 *tmpStride];\
2203
        const int tmp2= tmp[2 *tmpStride];\
2204
        const int tmp3= tmp[3 *tmpStride];\
2205
        const int tmp4= tmp[4 *tmpStride];\
2206
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2207
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2208
        dst++;\
2209
        tmp++;\
2210
    }\
2211
}\
2212
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2213
    const int h=4;\
2214
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215
    int i;\
2216
    for(i=0; i<h; i++)\
2217
    {\
2218
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2219
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2220
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2221
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2222
        dst+=dstStride;\
2223
        src+=srcStride;\
2224
    }\
2225
}\
2226
\
2227
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2228
    const int w=4;\
2229
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2230
    int i;\
2231
    for(i=0; i<w; i++)\
2232
    {\
2233
        const int srcB= src[-2*srcStride];\
2234
        const int srcA= src[-1*srcStride];\
2235
        const int src0= src[0 *srcStride];\
2236
        const int src1= src[1 *srcStride];\
2237
        const int src2= src[2 *srcStride];\
2238
        const int src3= src[3 *srcStride];\
2239
        const int src4= src[4 *srcStride];\
2240
        const int src5= src[5 *srcStride];\
2241
        const int src6= src[6 *srcStride];\
2242
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2243
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2244
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2245
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2246
        dst++;\
2247
        src++;\
2248
    }\
2249
}\
2250
\
2251
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2252
    const int h=4;\
2253
    const int w=4;\
2254
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2255
    int i;\
2256
    src -= 2*srcStride;\
2257
    for(i=0; i<h+5; i++)\
2258
    {\
2259
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2260
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2261
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2262
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2263
        tmp+=tmpStride;\
2264
        src+=srcStride;\
2265
    }\
2266
    tmp -= tmpStride*(h+5-2);\
2267
    for(i=0; i<w; i++)\
2268
    {\
2269
        const int tmpB= tmp[-2*tmpStride];\
2270
        const int tmpA= tmp[-1*tmpStride];\
2271
        const int tmp0= tmp[0 *tmpStride];\
2272
        const int tmp1= tmp[1 *tmpStride];\
2273
        const int tmp2= tmp[2 *tmpStride];\
2274
        const int tmp3= tmp[3 *tmpStride];\
2275
        const int tmp4= tmp[4 *tmpStride];\
2276
        const int tmp5= tmp[5 *tmpStride];\
2277
        const int tmp6= tmp[6 *tmpStride];\
2278
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2279
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2280
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2281
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2282
        dst++;\
2283
        tmp++;\
2284
    }\
2285
}\
2286
\
2287
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2288
    const int h=8;\
2289
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2290
    int i;\
2291
    for(i=0; i<h; i++)\
2292
    {\
2293
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2294
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2295
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2296
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2297
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2298
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2299
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2300
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2301
        dst+=dstStride;\
2302
        src+=srcStride;\
2303
    }\
2304
}\
2305
\
2306
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2307
    const int w=8;\
2308
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309
    int i;\
2310
    for(i=0; i<w; i++)\
2311
    {\
2312
        const int srcB= src[-2*srcStride];\
2313
        const int srcA= src[-1*srcStride];\
2314
        const int src0= src[0 *srcStride];\
2315
        const int src1= src[1 *srcStride];\
2316
        const int src2= src[2 *srcStride];\
2317
        const int src3= src[3 *srcStride];\
2318
        const int src4= src[4 *srcStride];\
2319
        const int src5= src[5 *srcStride];\
2320
        const int src6= src[6 *srcStride];\
2321
        const int src7= src[7 *srcStride];\
2322
        const int src8= src[8 *srcStride];\
2323
        const int src9= src[9 *srcStride];\
2324
        const int src10=src[10*srcStride];\
2325
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2326
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2327
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2328
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2329
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2330
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2331
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2332
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2333
        dst++;\
2334
        src++;\
2335
    }\
2336
}\
2337
\
2338
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2339
    const int h=8;\
2340
    const int w=8;\
2341
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2342
    int i;\
2343
    src -= 2*srcStride;\
2344
    for(i=0; i<h+5; i++)\
2345
    {\
2346
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2347
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2348
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2349
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2350
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2351
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2352
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2353
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2354
        tmp+=tmpStride;\
2355
        src+=srcStride;\
2356
    }\
2357
    tmp -= tmpStride*(h+5-2);\
2358
    for(i=0; i<w; i++)\
2359
    {\
2360
        const int tmpB= tmp[-2*tmpStride];\
2361
        const int tmpA= tmp[-1*tmpStride];\
2362
        const int tmp0= tmp[0 *tmpStride];\
2363
        const int tmp1= tmp[1 *tmpStride];\
2364
        const int tmp2= tmp[2 *tmpStride];\
2365
        const int tmp3= tmp[3 *tmpStride];\
2366
        const int tmp4= tmp[4 *tmpStride];\
2367
        const int tmp5= tmp[5 *tmpStride];\
2368
        const int tmp6= tmp[6 *tmpStride];\
2369
        const int tmp7= tmp[7 *tmpStride];\
2370
        const int tmp8= tmp[8 *tmpStride];\
2371
        const int tmp9= tmp[9 *tmpStride];\
2372
        const int tmp10=tmp[10*tmpStride];\
2373
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2374
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2375
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2376
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2377
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2378
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2379
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2380
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2381
        dst++;\
2382
        tmp++;\
2383
    }\
2384
}\
2385
\
2386
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2387
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2388
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2389
    src += 8*srcStride;\
2390
    dst += 8*dstStride;\
2391
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2392
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2393
}\
2394
\
2395
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2396
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2397
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2398
    src += 8*srcStride;\
2399
    dst += 8*dstStride;\
2400
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2401
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2402
}\
2403
\
2404
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2405
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2406
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2407
    src += 8*srcStride;\
2408
    dst += 8*dstStride;\
2409
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2410
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2411
}\
2412

    
2413
/**
 * Generate the 16 H.264 quarter-pel motion-compensation functions
 * (mc00 .. mc33) for one block size, built on the h/v/hv lowpass helpers
 * declared by H264_LOWPASS.
 * OPNAME selects put_ or avg_ semantics; SIZE is the block dimension.
 * The digits in mcXY are the quarter-pel offsets: X horizontal, Y vertical.
 * Fractional positions are formed by averaging (pixelsN_l2) a half-pel
 * filtered block with either the source or another filtered block.
 * Note: stray line-number lines from a corrupted merge were removed; the
 * code itself is unchanged.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Rounding/store ops plugged into H264_LOWPASS:
 * op_put/op_avg round a single-pass (h or v) 6-tap result with (+16)>>5,
 * op2_put/op2_avg round the two-pass hv result with (+512)>>10;
 * the avg variants average with the existing destination pixel. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
2557
H264_LOWPASS(avg_       , op_avg, op2_avg)
2558
H264_MC(put_, 2)
2559
H264_MC(put_, 4)
2560
H264_MC(put_, 8)
2561
H264_MC(put_, 16)
2562
H264_MC(avg_, 4)
2563
H264_MC(avg_, 8)
2564
H264_MC(avg_, 16)
2565

    
2566
/* The op_* helpers are only meaningful inside the H264_LOWPASS expansions
 * above; undefine them so later code cannot pick them up by accident. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif /* NOTE(review): closes a conditional opened earlier in the file — verify against full source */

/* The integer-pel (mc00) cases need no filtering: alias them to the
 * plain pixel copy/average helpers. */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c

/**
 * WMV2 horizontal half-pel lowpass filter over an 8-pixel-wide block.
 * dst[i] = clip((9*(src[i]+src[i+1]) - (src[i-1]+src[i+2]) + 8) >> 4),
 * i.e. a 4-tap (-1, 9, 9, -1)/16 filter with rounding; the clip is done
 * through the ff_cropTbl lookup (cm). Reads one pixel to the left and two
 * to the right of each 8-pixel row; processes h rows.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}

/* Fixed-size wrappers around the generic pixel copy/average helpers,
 * exported so they can be used as function pointers for 8x8 and 16x16
 * integer-pel motion compensation. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}

#if CONFIG_RV40_DECODER
/* RV40 treats the (3,3) quarter-pel position as a plain center-of-four
 * average, so these simply forward to the xy2 half-pel helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

/**
 * WMV2 vertical half-pel lowpass filter: the same (-1, 9, 9, -1)/16 filter
 * as wmv2_mspel8_h_lowpass, applied down each of w columns producing 8
 * output rows. Reads one row above and two rows below the 8-row column.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        /* load the whole column once so each sample is fetched a single time */
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

/* WMV2 mspel motion compensation, 8x8 blocks. mcXY names follow the usual
 * convention: X is the horizontal and Y the vertical sub-pel position.
 * Quarter-pel positions average the half-pel filtered block with the
 * nearest integer-pel pixels via put_pixels8_l2. */

/* (1/4, 0): average source with the horizontal half-pel block */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (1/2, 0): pure horizontal half-pel filter */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3/4, 0): average src+1 with the horizontal half-pel block */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* (0, 1/2): pure vertical half-pel filter */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* (1/4, 1/2): average vertical half-pel with the h-then-v filtered block.
 * halfH holds 11 filtered rows (one above, two below) so the vertical
 * pass can run on already-h-filtered data starting at halfH+8. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3/4, 1/2): as mc12 but the vertical half-pel block is taken at src+1 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (1/2, 1/2): horizontal then vertical half-pel filtering */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

/**
 * H.263 deblocking filter across a horizontal block edge (filters
 * vertically). For each of the 8 columns it reads two pixels on either
 * side of the edge (p0,p1 above; p2,p3 below), computes the edge gradient
 * d and applies a strength-dependent correction d1 to the two inner
 * pixels and a clipped correction d2 to the two outer ones.
 * The whole body is compiled out unless an H.263 codec is enabled.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response: full correction for small |d|,
         * ramping down to zero beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clip to 0..255: any value outside that range has
         * bit 8 set; ~(p>>31) yields 0 for negatives, 255 for overflows */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

/**
 * H.263 deblocking filter across a vertical block edge (filters
 * horizontally). Identical math to h263_v_loop_filter_c, but the four
 * pixels p0..p3 straddle the edge within each of the 8 rows.
 * Compiled out unless an H.263 codec is enabled.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear response: full correction for small |d|,
         * ramping down to zero beyond 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clip to 0..255 (see h263_v_loop_filter_c) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}

/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * First pass filters vertically into temp[] (scaled by 4); the top and
 * bottom rows are passed through unfiltered (stored as 4*src). Second
 * pass filters horizontally and rounds back down; the left and right
 * columns only undo the vertical scaling. Operates in place on src.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical [1 2 1] pass; border rows are copied (pre-scaled by 4) */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal [1 2 1] pass with rounding; border columns only rescale */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks,
 * h rows high, both advancing by line_size per row.
 * The first argument is an unused context pointer kept for the common
 * comparison-function signature.
 * @return the SAD over the 16 x h block
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel with its right neighbour), 16 wide, h rows.
 * Reads one pixel past column 15 of pix2.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the vertical half-pel interpolation of pix2
 * (avg2 of each pixel with the one directly below), 16 wide, h rows.
 * Reads one row past the h-th row of pix2.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the diagonal half-pel interpolation of pix2
 * (avg4 over the 2x2 neighbourhood), 16 wide, h rows.
 * Reads one pixel right of column 15 and one row below row h-1 of pix2.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * Sum of absolute differences (SAD) between two 8-pixel-wide blocks,
 * h rows high. 8-wide counterpart of pix_abs16_c; the first argument is
 * an unused context pointer kept for the common comparison signature.
 * @return the SAD over the 8 x h block
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the horizontal half-pel interpolation of pix2,
 * 8 wide, h rows. Reads one pixel past column 7 of pix2.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the vertical half-pel interpolation of pix2,
 * 8 wide, h rows. Reads one row past the h-th row of pix2.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/**
 * SAD between pix1 and the diagonal half-pel interpolation of pix2
 * (avg4 over the 2x2 neighbourhood), 8 wide, h rows.
 * Reads one pixel right of column 7 and one row below row h-1 of pix2.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2998
    MpegEncContext *c = v;
2999
    int score1=0;
3000
    int score2=0;
3001
    int x,y;
3002

    
3003
    for(y=0; y<h; y++){
3004
        for(x=0; x<16; x++){
3005
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3006
        }
3007
        if(y+1<h){
3008
            for(x=0; x<15; x++){
3009
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3010
                             - s1[x+1] + s1[x+1+stride])
3011
                        -FFABS(  s2[x  ] - s2[x  +stride]
3012
                             - s2[x+1] + s2[x+1+stride]);
3013
            }
3014
        }
3015
        s1+= stride;
3016
        s2+= stride;
3017
    }
3018

    
3019
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3020
    else  return score1 + FFABS(score2)*8;
3021
}
3022

    
3023
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3024
    MpegEncContext *c = v;
3025
    int score1=0;
3026
    int score2=0;
3027
    int x,y;
3028

    
3029
    for(y=0; y<h; y++){
3030
        for(x=0; x<8; x++){
3031
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3032
        }
3033
        if(y+1<h){
3034
            for(x=0; x<7; x++){
3035
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
3036
                             - s1[x+1] + s1[x+1+stride])
3037
                        -FFABS(  s2[x  ] - s2[x  +stride]
3038
                             - s2[x+1] + s2[x+1+stride]);
3039
            }
3040
        }
3041
        s1+= stride;
3042
        s2+= stride;
3043
    }
3044

    
3045
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3046
    else  return score1 + FFABS(score2)*8;
3047
}
3048

    
3049
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3050
    int i;
3051
    unsigned int sum=0;
3052

    
3053
    for(i=0; i<8*8; i++){
3054
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3055
        int w= weight[i];
3056
        b>>= RECON_SHIFT;
3057
        assert(-512<b && b<512);
3058

    
3059
        sum += (w*b)*(w*b)>>4;
3060
    }
3061
    return sum>>2;
3062
}
3063

    
3064
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3065
    int i;
3066

    
3067
    for(i=0; i<8*8; i++){
3068
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3069
    }
3070
}
3071

    
3072
/**
3073
 * permutes an 8x8 block.
3074
 * @param block the block which will be permuted according to the given permutation vector
3075
 * @param permutation the permutation vector
3076
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3077
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3078
 *                  (inverse) permutated to scantable order!
3079
 */
3080
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3081
{
3082
    int i;
3083
    DCTELEM temp[64];
3084

    
3085
    if(last<=0) return;
3086
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3087

    
3088
    for(i=0; i<=last; i++){
3089
        const int j= scantable[i];
3090
        temp[j]= block[j];
3091
        block[j]=0;
3092
    }
3093

    
3094
    for(i=0; i<=last; i++){
3095
        const int j= scantable[i];
3096
        const int perm_j= permutation[j];
3097
        block[perm_j]= temp[j];
3098
    }
3099
}
3100

    
3101
/* Constant comparison used for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3104

    
3105
/**
 * Fill the 6-entry comparison-function table used by the motion
 * estimator / mb decision from the DSPContext, selecting by the
 * FF_CMP_* id stored in the low byte of @p type.
 * Unknown ids leave the table zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* start from a clean table so unknown ids yield NULL entries */
    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3164

    
3165
/** Zero one 64-coefficient DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
3169

    
3170
/**
3171
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3172
 */
3173
static void clear_blocks_c(DCTELEM *blocks)
3174
{
3175
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3176
}
3177

    
3178
/**
 * Byte-wise addition: dst[i] += src[i] (mod 256) for i in [0, w).
 * The main loop adds sizeof(long) bytes at a time with SWAR arithmetic:
 * low 7 bits of each byte are added carry-free, the top bit is patched
 * in with XOR so carries never cross byte lanes.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    const unsigned long m7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long m80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */

    /* The (int) cast is essential: "w - sizeof(long)" is otherwise
     * computed in size_t and wraps to a huge value when w < sizeof(long),
     * sending the loop out of bounds. */
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src + i);
        long b = *(long *) (dst + i);
        *(long *) (dst + i) = ((a & m7f) + (b & m7f)) ^ ((a ^ b) & m80);
    }
    /* scalar tail for the remaining 0..sizeof(long)-1 bytes */
    for (; i < w; i++)
        dst[i + 0] += src[i + 0];
}
3188

    
3189
/**
 * Byte-wise addition of two sources: dst[i] = src1[i] + src2[i] (mod 256).
 * Same SWAR scheme as add_bytes_c: carry-free add of the low 7 bits per
 * byte, top bit patched in with XOR.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    const unsigned long m7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long m80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */

    /* (int) cast: without it "w - sizeof(long)" wraps in size_t for
     * w < sizeof(long) and the loop runs out of bounds. */
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src1 + i);
        long b = *(long *) (src2 + i);
        *(long *) (dst + i) = ((a & m7f) + (b & m7f)) ^ ((a ^ b) & m80);
    }
    for (; i < w; i++)
        dst[i] = src1[i] + src2[i];
}
3199

    
3200
/**
 * Byte-wise subtraction: dst[i] = src1[i] - src2[i] (mod 256).
 * The word loop subtracts sizeof(long) bytes at a time without borrows
 * crossing byte lanes; a plain byte loop is used when src2 is not
 * word-aligned and unaligned loads are slow on this target.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    const unsigned long m7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long m80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* (int) cast: without it "w - sizeof(long)" wraps in size_t for
     * w < sizeof(long) and the loop runs out of bounds. */
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src1 + i);
        long b = *(long *) (src2 + i);
        *(long *) (dst + i) = ((a | m80) - (b & m7f)) ^ ((a ^ b ^ m80) & m80);
    }
    for (; i < w; i++)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}
3224

    
3225
/**
 * HuffYUV median-prediction decode: reconstruct a row by adding each
 * residual in @p diff to the median of (left, top, left+top-topleft);
 * left/left_top carry the running state across calls.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t left_val    = *left;
    uint8_t topleft_val = *left_top;

    for (i = 0; i < w; i++) {
        const uint8_t top = src1[i];
        left_val = mid_pred(left_val, top,
                            (left_val + top - topleft_val) & 0xFF) + diff[i];
        topleft_val = top;
        dst[i] = left_val;
    }

    *left     = left_val;
    *left_top = topleft_val;
}
3241

    
3242
/**
 * HuffYUV median-prediction encode: write into @p dst the residual of
 * each src2 pixel against the median of (left, top, left+top-topleft);
 * left/left_top carry the running state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val    = *left;
    uint8_t topleft_val = *left_top;

    for (i = 0; i < w; i++) {
        const uint8_t top  = src1[i];
        const int     pred = mid_pred(left_val, top,
                                      (left_val + top - topleft_val) & 0xFF);
        topleft_val = top;
        left_val    = src2[i];
        dst[i]      = left_val - pred;
    }

    *left     = left_val;
    *left_top = topleft_val;
}
3259

    
3260
/**
 * HuffYUV left-prediction decode: running sum of the residuals, stored
 * truncated to 8 bits in dst. Returns the (untruncated) accumulator so
 * the caller can continue on the next slice.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc; /* stores acc mod 256 */
    }

    return acc;
}
3278

    
3279
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Left-prediction decode for packed 32-bit BGRA pixels: each channel is
 * an independent running sum (mod 256). The four state pointers carry
 * the accumulators across calls.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4 * i;
        uint8_t       *dp = dst + 4 * i;

        acc_b += sp[B];
        acc_g += sp[G];
        acc_r += sp[R];
        acc_a += sp[A];

        dp[B] = acc_b;
        dp[G] = acc_g;
        dp[R] = acc_r;
        dp[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3319

    
3320
/* 2-point butterfly into new outputs: o1 = i1+i2, o2 = i1-i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x,y become x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Absolute sum of the final butterfly stage: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3334

    
3335
/**
 * SATD comparison: 8x8 Walsh-Hadamard transform of the pixel difference
 * src - dst, then sum of absolute transform coefficients. Rows are
 * transformed first, then columns, with the last column stage folded
 * into the accumulation via BUTTERFLYA.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point butterfly network on each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: the last stage is combined with |coeff| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3386

    
3387
/**
 * Intra SATD: 8x8 Walsh-Hadamard transform of the source pixels
 * themselves (second operand unused), sum of absolute coefficients
 * with the DC term subtracted at the end.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point butterfly network on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: last stage folded into |coeff| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* remove the DC contribution so flat blocks score 0 */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3434

    
3435
/**
 * Frequency-domain SAD: forward-DCT the 8x8 difference src1 - src2 and
 * return the sum of absolute DCT coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3445

    
3446
#if CONFIG_GPL
3447
#define DCT8_1D {\
3448
    const int s07 = SRC(0) + SRC(7);\
3449
    const int s16 = SRC(1) + SRC(6);\
3450
    const int s25 = SRC(2) + SRC(5);\
3451
    const int s34 = SRC(3) + SRC(4);\
3452
    const int a0 = s07 + s34;\
3453
    const int a1 = s16 + s25;\
3454
    const int a2 = s07 - s34;\
3455
    const int a3 = s16 - s25;\
3456
    const int d07 = SRC(0) - SRC(7);\
3457
    const int d16 = SRC(1) - SRC(6);\
3458
    const int d25 = SRC(2) - SRC(5);\
3459
    const int d34 = SRC(3) - SRC(4);\
3460
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3461
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3462
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3463
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3464
    DST(0,  a0 + a1     ) ;\
3465
    DST(1,  a4 + (a7>>2)) ;\
3466
    DST(2,  a2 + (a3>>1)) ;\
3467
    DST(3,  a5 + (a6>>2)) ;\
3468
    DST(4,  a0 - a1     ) ;\
3469
    DST(5,  a6 - (a5>>2)) ;\
3470
    DST(6, (a2>>1) - a3 ) ;\
3471
    DST(7, (a4>>2) - a7 ) ;\
3472
}
3473

    
3474
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3475
    MpegEncContext * const s= (MpegEncContext *)c;
3476
    DCTELEM dct[8][8];
3477
    int i;
3478
    int sum=0;
3479

    
3480
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3481

    
3482
#define SRC(x) dct[i][x]
3483
#define DST(x,v) dct[i][x]= v
3484
    for( i = 0; i < 8; i++ )
3485
        DCT8_1D
3486
#undef SRC
3487
#undef DST
3488

    
3489
#define SRC(x) dct[x][i]
3490
#define DST(x,v) sum += FFABS(v)
3491
    for( i = 0; i < 8; i++ )
3492
        DCT8_1D
3493
#undef SRC
3494
#undef DST
3495
    return sum;
3496
}
3497
#endif
3498

    
3499
/**
 * DCTMAX comparison: forward-DCT the 8x8 difference src1 - src2 and
 * return the largest absolute DCT coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
3514

    
3515
/**
 * PSNR-style comparison: run the 8x8 difference block through the
 * encoder's quantize + dequantize + IDCT round trip and return the
 * squared error against the unquantized block, i.e. the distortion
 * the current qscale would introduce.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;   /* copy kept as the reference */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* forced inter quantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    /* quantize (last arg receives the overflow flag in i), dequantize,
       and bring the block back to the spatial domain */
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3537

    
3538
/**
 * Rate-distortion comparison for one 8x8 block: quantize the difference,
 * count the VLC bits needed to code the coefficients (rate), reconstruct
 * the block and measure the squared error (distortion), and combine both
 * as distortion + lambda*rate with a qscale-derived lambda.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local stride-8 copies; lsrc2 is reconstructed into below */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* quantize; the last argument receives the overflow flag in i */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* rate: sum the code lengths of all (run, level) pairs in scan order */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;   /* level outside VLC range -> escape */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* distortion: dequantize, reconstruct into lsrc2, compare with lsrc1 */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* 109/128 ~= lambda scaling; +64 rounds the >>7 */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3613

    
3614
/**
 * Bit-count comparison: quantize the 8x8 difference block and return
 * the number of VLC bits needed to code it (rate only, no distortion).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* quantize; the last argument receives the overflow flag in i */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the macroblock type */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* sum the code lengths of all (run, level) pairs in scan order */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;   /* level outside VLC range -> escape */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3672

    
3673
/**
 * Template for vsad_intra8_c / vsad_intra16_c: sum of absolute vertical
 * gradients |s[x] - s[x+stride]| over a block of the given width and
 * height h, measuring vertical activity inside a single picture
 * (the second pixel pointer is unused).
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3690

    
3691
/**
 * Inter VSAD: sum of |vertical gradient of (s1 - s2)| over a 16-wide
 * block — how much the difference signal changes from row to row.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += abs(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3705

    
3706
/* Square helper used by the VSSE comparisons below. */
#define SQ(a) ((a)*(a))
/**
 * Template for vsse_intra8_c / vsse_intra16_c: sum of squared vertical
 * gradients (s[x] - s[x+stride])^2 over a block of the given width and
 * height h (the second pixel pointer is unused).
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3724

    
3725
/**
 * Inter VSSE: sum of squared vertical gradients of (s1 - s2) over a
 * 16-wide block.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3739

    
3740
/** Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int i, sum = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
3748

    
3749
/* Build the 16x16 comparison functions from the 8x8 kernels above:
 * WRAPPER8_16_SQ sums the 8x8 result over the four 8x8 quadrants. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

    
3760
/** Element-wise product: dst[i] = src0[i] * src1[i]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    const float *end = src0 + len;

    while (src0 < end)
        *dst++ = *src0++ * *src1++;
}
3765

    
3766
/** dst[i] = src0[i] * src1[len-1-i] (second operand read back to front). */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;

    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
3772

    
3773
/** Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;

    for (i = 0; i < len; i++) {
        const float prod = src0[i] * src1[i];
        dst[i] = prod + src2[i];
    }
}
3778

    
3779
/**
 * Overlap-add windowing: writes 2*len outputs centred on dst+len,
 * combining src0 (first half, forward) and src1 (second half,
 * reversed) through the symmetric 2*len-sample window @p win.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i, j;

    /* index from the centre: i runs over the first half, j mirrors it */
    dst  += len;
    win  += len;
    src0 += len;

    for (i = -len, j = len - 1; i < 0; i++, j--) {
        const float a  = src0[i];
        const float b  = src1[j];
        const float wi = win[i];
        const float wj = win[j];

        dst[i] = a * wj - b * wi;
        dst[j] = a * wi + b * wj;
    }
}
3795

    
3796
/** Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *end = src + len;

    while (src < end)
        *dst++ = *src++ * mul;
}
3803

    
3804
/**
 * Multiply src by a sequence of 2-element sub-vectors and a scalar:
 * dst[2k+j] = src[2k+j] * sv[k][j] * mul, j = 0..1.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, j;

    for (i = 0; i < len; i += 2, sv++)
        for (j = 0; j < 2; j++)
            dst[i + j] = src[i + j] * sv[0][j] * mul;
}
3813

    
3814
/**
 * Multiply src by a sequence of 4-element sub-vectors and a scalar:
 * dst[4k+j] = src[4k+j] * sv[k][j] * mul, j = 0..3.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, j;

    for (i = 0; i < len; i += 4, sv++)
        for (j = 0; j < 4; j++)
            dst[i + j] = src[i + j] * sv[0][j] * mul;
}
3825

    
3826
/**
 * Scale a sequence of 2-element sub-vectors by a scalar:
 * dst[2k+j] = sv[k][j] * mul, j = 0..1.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, j;

    for (i = 0; i < len; i += 2, sv++)
        for (j = 0; j < 2; j++)
            dst[i + j] = sv[0][j] * mul;
}
3835

    
3836
/**
 * Scale a sequence of 4-element sub-vectors by a scalar:
 * dst[4k+j] = sv[k][j] * mul, j = 0..3.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, j;

    for (i = 0; i < len; i += 4, sv++)
        for (j = 0; j < 4; j++)
            dst[i + j] = sv[0][j] * mul;
}
3847

    
3848
/** In-place butterflies: v1[i], v2[i] become v1[i]+v2[i], v1[i]-v2[i]. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;

    for (i = 0; i < len; i++) {
        const float sum  = v1[i] + v2[i];
        const float diff = v1[i] - v2[i];

        v1[i] = sum;
        v2[i] = diff;
    }
}
3858

    
3859
/** Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    const float *end = v1 + len;
    float acc = 0.0;

    while (v1 < end)
        acc += *v1++ * *v2++;

    return acc;
}
3869

    
3870
/**
 * Clip one float, passed and returned as its raw IEEE-754 bit pattern,
 * against [min, max] where min < 0 < max (see the caller below).
 * Negative floats compare as large unsigned values, so "a > mini"
 * catches values below min; XORing the sign bit maps positive values
 * onto the same ordering so the second test catches values above max.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    /* 1U << 31: the previous 1 << 31 left-shifted into the sign bit of a
     * signed int, which is undefined behavior in C. */
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
3878

    
3879
/**
 * Clip a float vector to [*min, *max] where *min < 0 < *max, working on
 * the raw IEEE-754 bit patterns via clipf_c_one.
 * NOTE(review): processes 8 elements per iteration — assumes len is a
 * multiple of 8, matching the unrolled scalar path in vector_clipf_c.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U << 31: the previous 1 << 31 shifted into the sign bit of a
     * signed int, which is undefined behavior in C. */
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
3897
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3898
    int i;
3899
    if(min < 0 && max > 0) {
3900
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3901
    } else {
3902
        for(i=0; i < len; i+=8) {
3903
            dst[i    ] = av_clipf(src[i    ], min, max);
3904
            dst[i + 1] = av_clipf(src[i + 1], min, max);
3905
            dst[i + 2] = av_clipf(src[i + 2], min, max);
3906
            dst[i + 3] = av_clipf(src[i + 3], min, max);
3907
            dst[i + 4] = av_clipf(src[i + 4], min, max);
3908
            dst[i + 5] = av_clipf(src[i + 5], min, max);
3909
            dst[i + 6] = av_clipf(src[i + 6], min, max);
3910
            dst[i + 7] = av_clipf(src[i + 7], min, max);
3911
        }
3912
    }
3913
}
3914

    
3915
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3916
{
3917
    int res = 0;
3918

    
3919
    while (order--)
3920
        res += (*v1++ * *v2++) >> shift;
3921

    
3922
    return res;
3923
}
3924

    
3925
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    /* Returns the dot product of v1 and v2 over 'order' elements, while
     * simultaneously adding mul * v3[i] into v1[i] in place.  The product
     * for the return value uses v1[i] as read BEFORE the update. */
    int i;
    int res = 0;

    for (i = 0; i &lt; order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return res;
}
3934

    
3935
#define W0 2048
3936
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3937
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3938
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3939
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3940
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3941
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3942
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3943

    
3944
/**
 * One horizontal pass of the WMV2 8-point inverse DCT, computed in place
 * on the 8 coefficients b[0..7].
 * Fixed-point butterfly implementation: W0..W7 are 2048*sqrt(2)*cos(k*pi/16)
 * (see the #defines above); 181/256 approximates 1/sqrt(2).
 * Results are rounded (+ (1&lt;&lt;7)) and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1: odd/even coefficient butterflies*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2: rotate the odd terms by 1/sqrt(2) (181/256)*/
    s1 = (181*(a1-a5+a7-a3)+128)&gt;&gt;8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)&gt;&gt;8;
    /*step 3: final butterflies with rounding and &gt;&gt;8 scaling*/
    b[0] = (a0+a2+a1+a5 + (1&lt;&lt;7))&gt;&gt;8;
    b[1] = (a4+a6 +s1   + (1&lt;&lt;7))&gt;&gt;8;
    b[2] = (a4-a6 +s2   + (1&lt;&lt;7))&gt;&gt;8;
    b[3] = (a0-a2+a7+a3 + (1&lt;&lt;7))&gt;&gt;8;
    b[4] = (a0-a2-a7-a3 + (1&lt;&lt;7))&gt;&gt;8;
    b[5] = (a4-a6 -s2   + (1&lt;&lt;7))&gt;&gt;8;
    b[6] = (a4+a6 -s1   + (1&lt;&lt;7))&gt;&gt;8;
    b[7] = (a0+a2-a1-a5 + (1&lt;&lt;7))&gt;&gt;8;
}
3970
/**
 * One vertical pass of the WMV2 8-point inverse DCT, computed in place on
 * the column b[0], b[8], ..., b[56] (stride 8 within the 8x8 block).
 * Same butterfly structure as wmv2_idct_row, but step 1 keeps extended
 * precision by rounding (+4) and pre-shifting by only 3 bits; the final
 * outputs are rounded (+ (1&lt;&lt;13)) and scaled down by 14 bits.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)&gt;&gt;3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)&gt;&gt;3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)&gt;&gt;3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)&gt;&gt;3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)&gt;&gt;3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)&gt;&gt;3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )&gt;&gt;3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )&gt;&gt;3;
    /*step 2: rotate odd terms by 1/sqrt(2) (181/256)*/
    s1 = (181*(a1-a5+a7-a3)+128)&gt;&gt;8;
    s2 = (181*(a1-a5-a7+a3)+128)&gt;&gt;8;
    /*step 3: final butterflies, rounded and scaled down by 14 bits*/
    b[8*0] = (a0+a2+a1+a5 + (1&lt;&lt;13))&gt;&gt;14;
    b[8*1] = (a4+a6 +s1   + (1&lt;&lt;13))&gt;&gt;14;
    b[8*2] = (a4-a6 +s2   + (1&lt;&lt;13))&gt;&gt;14;
    b[8*3] = (a0-a2+a7+a3 + (1&lt;&lt;13))&gt;&gt;14;

    b[8*4] = (a0-a2-a7-a3 + (1&lt;&lt;13))&gt;&gt;14;
    b[8*5] = (a4-a6 -s2   + (1&lt;&lt;13))&gt;&gt;14;
    b[8*6] = (a4+a6 -s1   + (1&lt;&lt;13))&gt;&gt;14;
    b[8*7] = (a0+a2-a1-a5 + (1&lt;&lt;13))&gt;&gt;14;
}
3997
void ff_wmv2_idct_c(short * block){
3998
    int i;
3999

    
4000
    for(i=0;i<64;i+=8){
4001
        wmv2_idct_row(block+i);
4002
    }
4003
    for(i=0;i<8;i++){
4004
        wmv2_idct_col(block+i);
4005
    }
4006
}
4007
/* XXX: those functions should be suppressed ASAP when all IDCTs are
4008
 converted */
4009
/* WMV2 IDCT then store the clamped result into dest (stride line_size). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
4014
/* WMV2 IDCT then add the clamped result onto dest (stride line_size). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
4019
/* 8x8 j_rev_dct IDCT then store the clamped result into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
4024
/* 8x8 j_rev_dct IDCT then add the clamped result onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4029

    
4030
/* 4x4 IDCT + clamped store; selected for lowres==1 in dsputil_init(). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
4035
/* 4x4 IDCT + clamped add; selected for lowres==1 in dsputil_init(). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4040

    
4041
/* 2x2 IDCT + clamped store; selected for lowres==2 in dsputil_init(). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
4046
/* 2x2 IDCT + clamped add; selected for lowres==2 in dsputil_init(). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4051

    
4052
/* 1x1 "IDCT" + store; selected for lowres==3 in dsputil_init().
 * Only the DC coefficient matters: it is rounded, scaled down by 3 bits
 * and clamped to [0,255] via the crop table. line_size is unused. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)&gt;&gt;3];
}
4058
/* 1x1 "IDCT" + add; selected for lowres==3 in dsputil_init().
 * Adds the rounded, &gt;&gt;3-scaled DC coefficient to the existing pixel and
 * clamps to [0,255] via the crop table. line_size is unused. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)&gt;&gt;3)];
}
4064

    
4065
/* Intentional no-op with a (buffer, stride, height) signature.
 * NOTE(review): presumably a do-nothing target for DSP function pointers
 * (e.g. benchmarking/disabling) — confirm at the assignment sites. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4066

    
4067
/* init static data */
4068
av_cold void dsputil_static_init(void)
4069
{
4070
    int i;
4071

    
4072
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4073
    for(i=0;i<MAX_NEG_CROP;i++) {
4074
        ff_cropTbl[i] = 0;
4075
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4076
    }
4077

    
4078
    for(i=0;i<512;i++) {
4079
        ff_squareTbl[i] = (i - 256) * (i - 256);
4080
    }
4081

    
4082
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4083
}
4084

    
4085
int ff_check_alignment(void){
4086
    static int did_fail=0;
4087
    DECLARE_ALIGNED(16, int, aligned);
4088

    
4089
    if((intptr_t)&aligned & 15){
4090
        if(!did_fail){
4091
#if HAVE_MMX || HAVE_ALTIVEC
4092
            av_log(NULL, AV_LOG_ERROR,
4093
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4094
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4095
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4096
                "Do not report crashes to FFmpeg developers.\n");
4097
#endif
4098
            did_fail=1;
4099
        }
4100
        return -1;
4101
    }
4102
    return 0;
4103
}
4104

    
4105
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4106
{
4107
    int i;
4108

    
4109
    ff_check_alignment();
4110

    
4111
#if CONFIG_ENCODERS
4112
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4113
        c->fdct = fdct_ifast;
4114
        c->fdct248 = fdct_ifast248;
4115
    }
4116
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4117
        c->fdct = ff_faandct;
4118
        c->fdct248 = ff_faandct248;
4119
    }
4120
    else {
4121
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4122
        c->fdct248 = ff_fdct248_islow;
4123
    }
4124
#endif //CONFIG_ENCODERS
4125

    
4126
    if(avctx->lowres==1){
4127
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4128
            c->idct_put= ff_jref_idct4_put;
4129
            c->idct_add= ff_jref_idct4_add;
4130
        }else{
4131
            c->idct_put= ff_h264_lowres_idct_put_c;
4132
            c->idct_add= ff_h264_lowres_idct_add_c;
4133
        }
4134
        c->idct    = j_rev_dct4;
4135
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4136
    }else if(avctx->lowres==2){
4137
        c->idct_put= ff_jref_idct2_put;
4138
        c->idct_add= ff_jref_idct2_add;
4139
        c->idct    = j_rev_dct2;
4140
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4141
    }else if(avctx->lowres==3){
4142
        c->idct_put= ff_jref_idct1_put;
4143
        c->idct_add= ff_jref_idct1_add;
4144
        c->idct    = j_rev_dct1;
4145
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4146
    }else{
4147
        if(avctx->idct_algo==FF_IDCT_INT){
4148
            c->idct_put= ff_jref_idct_put;
4149
            c->idct_add= ff_jref_idct_add;
4150
            c->idct    = j_rev_dct;
4151
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4152
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4153
                avctx->idct_algo==FF_IDCT_VP3){
4154
            c->idct_put= ff_vp3_idct_put_c;
4155
            c->idct_add= ff_vp3_idct_add_c;
4156
            c->idct    = ff_vp3_idct_c;
4157
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4158
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4159
            c->idct_put= ff_wmv2_idct_put_c;
4160
            c->idct_add= ff_wmv2_idct_add_c;
4161
            c->idct    = ff_wmv2_idct_c;
4162
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4163
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4164
            c->idct_put= ff_faanidct_put;
4165
            c->idct_add= ff_faanidct_add;
4166
            c->idct    = ff_faanidct;
4167
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4168
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4169
            c->idct_put= ff_ea_idct_put_c;
4170
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4171
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4172
            c->idct     = ff_bink_idct_c;
4173
            c->idct_add = ff_bink_idct_add_c;
4174
            c->idct_put = ff_bink_idct_put_c;
4175
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4176
        }else{ //accurate/default
4177
            c->idct_put= ff_simple_idct_put;
4178
            c->idct_add= ff_simple_idct_add;
4179
            c->idct    = ff_simple_idct;
4180
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4181
        }
4182
    }
4183

    
4184
    c->get_pixels = get_pixels_c;
4185
    c->diff_pixels = diff_pixels_c;
4186
    c->put_pixels_clamped = put_pixels_clamped_c;
4187
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4188
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4189
    c->add_pixels_clamped = add_pixels_clamped_c;
4190
    c->add_pixels8 = add_pixels8_c;
4191
    c->add_pixels4 = add_pixels4_c;
4192
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4193
    c->emulated_edge_mc = ff_emulated_edge_mc;
4194
    c->gmc1 = gmc1_c;
4195
    c->gmc = ff_gmc_c;
4196
    c->clear_block = clear_block_c;
4197
    c->clear_blocks = clear_blocks_c;
4198
    c->pix_sum = pix_sum_c;
4199
    c->pix_norm1 = pix_norm1_c;
4200

    
4201
    c->fill_block_tab[0] = fill_block16_c;
4202
    c->fill_block_tab[1] = fill_block8_c;
4203
    c->scale_block = scale_block_c;
4204

    
4205
    /* TODO [0] 16  [1] 8 */
4206
    c->pix_abs[0][0] = pix_abs16_c;
4207
    c->pix_abs[0][1] = pix_abs16_x2_c;
4208
    c->pix_abs[0][2] = pix_abs16_y2_c;
4209
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4210
    c->pix_abs[1][0] = pix_abs8_c;
4211
    c->pix_abs[1][1] = pix_abs8_x2_c;
4212
    c->pix_abs[1][2] = pix_abs8_y2_c;
4213
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4214

    
4215
#define dspfunc(PFX, IDX, NUM) \
4216
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4217
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4218
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4219
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4220

    
4221
    dspfunc(put, 0, 16);
4222
    dspfunc(put_no_rnd, 0, 16);
4223
    dspfunc(put, 1, 8);
4224
    dspfunc(put_no_rnd, 1, 8);
4225
    dspfunc(put, 2, 4);
4226
    dspfunc(put, 3, 2);
4227

    
4228
    dspfunc(avg, 0, 16);
4229
    dspfunc(avg_no_rnd, 0, 16);
4230
    dspfunc(avg, 1, 8);
4231
    dspfunc(avg_no_rnd, 1, 8);
4232
    dspfunc(avg, 2, 4);
4233
    dspfunc(avg, 3, 2);
4234
#undef dspfunc
4235

    
4236
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4237
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4238

    
4239
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4240
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4241
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4242
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4243
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4244
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4245
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4246
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4247
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4248

    
4249
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4250
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4251
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4252
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4253
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4254
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4255
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4256
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4257
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4258

    
4259
#define dspfunc(PFX, IDX, NUM) \
4260
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4261
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4262
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4263
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4264
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4265
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4266
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4267
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4268
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4269
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4270
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4271
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4272
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4273
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4274
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4275
    c->PFX ## _pixels_tab[IDX][15] = PFX #