Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ e6e98234

History | View | Annotate | Download (155 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of Libav.
9
 *
10
 * Libav is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * Libav is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with Libav; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
/* Shared lookup tables, both zero-initialised at this point.
 * The clamping helpers in this file index ff_cropTbl at an offset of
 * MAX_NEG_CROP, and the squared-error helpers index ff_squareTbl at an
 * offset of 256.
 * NOTE(review): the code that fills these tables is not visible here. */
uint8_t  ff_cropTbl[256 + 2 * MAX_NEG_CROP] = { 0 };
uint32_t ff_squareTbl[512]                  = { 0 };
45

    
46
/* Byte-replicated constants as wide as unsigned long: every byte holds 0x7f
 * (resp. 0x80), i.e. 0x7f7f7f7f for a 32-bit long and 0x7f7f7f7f7f7f7f7f for
 * a 64-bit one. */
#define pb_7f ((~0UL / 0xff) * 0x7fUL)
#define pb_80 ((~0UL / 0xff) * 0x80UL)
49

    
50
/* Zigzag scan order: the 64 entries are a permutation of 0..63.
 * NOTE(review): presumably entry i is the raster index (row * 8 + column)
 * of the i-th coefficient of an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
60

    
61
/* Zigzag scan used with the 2-4-8 IDCT. Note that, unlike the
 * specification, the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63
};
73

    
74
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
75
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
76

    
77
/* Alternate (horizontal) scan order: a permutation of 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63
};
87

    
88
/* Alternate (vertical) scan order: a permutation of 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63
};
98

    
99
/* Coefficient order expected on input by simple_idct_mmx
 * (a permutation of 0x00..0x3F). */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F
};
110

    
111
/* NOTE(review): presumably the per-row coefficient order for the SSE2 IDCT;
 * the code that consumes this table is not visible here. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7
};
112

    
113
/**
 * Initialise a ScanTable from a raw scan order and a coefficient permutation.
 *
 * @param permutation   table mapping an unpermuted coefficient index to its
 *                      permuted index
 * @param st            scan table to fill in
 * @param src_scantable 64-entry scan order; the pointer itself is stored in
 *                      st->scantable, so it must outlive st
 */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable)
{
    int pos;
    int highest = -1;

    st->scantable = src_scantable;

    /* Permuted scan order (plus its inverse on PPC builds). */
    for (pos = 0; pos < 64; pos++) {
        const int raw = src_scantable[pos];

        st->permutated[pos] = permutation[raw];
#if ARCH_PPC
        st->inverse[raw] = pos;
#endif
    }

    /* raster_end[pos] is the running maximum of permutated[0..pos]. */
    for (pos = 0; pos < 64; pos++) {
        const int cur = st->permutated[pos];

        if (highest < cur)
            highest = cur;
        st->raster_end[pos] = highest;
    }
}
136

    
137
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    const uint8_t *row = pix;
    int total = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            total += row[x];
        row += line_size;
    }
    return total;
}
158

    
159
static int pix_norm1_c(uint8_t * pix, int line_size)
160
{
161
    int s, i, j;
162
    uint32_t *sq = ff_squareTbl + 256;
163

    
164
    s = 0;
165
    for (i = 0; i < 16; i++) {
166
        for (j = 0; j < 16; j += 8) {
167
#if 0
168
            s += sq[pix[0]];
169
            s += sq[pix[1]];
170
            s += sq[pix[2]];
171
            s += sq[pix[3]];
172
            s += sq[pix[4]];
173
            s += sq[pix[5]];
174
            s += sq[pix[6]];
175
            s += sq[pix[7]];
176
#else
177
#if LONG_MAX > 2147483647
178
            register uint64_t x=*(uint64_t*)pix;
179
            s += sq[x&0xff];
180
            s += sq[(x>>8)&0xff];
181
            s += sq[(x>>16)&0xff];
182
            s += sq[(x>>24)&0xff];
183
            s += sq[(x>>32)&0xff];
184
            s += sq[(x>>40)&0xff];
185
            s += sq[(x>>48)&0xff];
186
            s += sq[(x>>56)&0xff];
187
#else
188
            register uint32_t x=*(uint32_t*)pix;
189
            s += sq[x&0xff];
190
            s += sq[(x>>8)&0xff];
191
            s += sq[(x>>16)&0xff];
192
            s += sq[(x>>24)&0xff];
193
            x=*(uint32_t*)(pix+4);
194
            s += sq[x&0xff];
195
            s += sq[(x>>8)&0xff];
196
            s += sq[(x>>16)&0xff];
197
            s += sq[(x>>24)&0xff];
198
#endif
199
#endif
200
            pix += 8;
201
        }
202
        pix += line_size - 16;
203
    }
204
    return s;
205
}
206

    
207
/**
 * Byte-swap an array of 32-bit words.
 *
 * @param dst destination array
 * @param src source array
 * @param w   number of 32-bit words to convert; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    /* Elements are processed strictly in ascending order. */
    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
224

    
225
/**
 * Byte-swap an array of 16-bit words.
 *
 * @param dst destination array
 * @param src source array
 * @param len number of 16-bit words to convert; nothing is done if len <= 0
 *            (a negative length used to make the countdown loop run away),
 *            which matches the behaviour of bswap_buf()
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
230

    
231
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232
{
233
    int s, i;
234
    uint32_t *sq = ff_squareTbl + 256;
235

    
236
    s = 0;
237
    for (i = 0; i < h; i++) {
238
        s += sq[pix1[0] - pix2[0]];
239
        s += sq[pix1[1] - pix2[1]];
240
        s += sq[pix1[2] - pix2[2]];
241
        s += sq[pix1[3] - pix2[3]];
242
        pix1 += line_size;
243
        pix2 += line_size;
244
    }
245
    return s;
246
}
247

    
248
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249
{
250
    int s, i;
251
    uint32_t *sq = ff_squareTbl + 256;
252

    
253
    s = 0;
254
    for (i = 0; i < h; i++) {
255
        s += sq[pix1[0] - pix2[0]];
256
        s += sq[pix1[1] - pix2[1]];
257
        s += sq[pix1[2] - pix2[2]];
258
        s += sq[pix1[3] - pix2[3]];
259
        s += sq[pix1[4] - pix2[4]];
260
        s += sq[pix1[5] - pix2[5]];
261
        s += sq[pix1[6] - pix2[6]];
262
        s += sq[pix1[7] - pix2[7]];
263
        pix1 += line_size;
264
        pix2 += line_size;
265
    }
266
    return s;
267
}
268

    
269
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270
{
271
    int s, i;
272
    uint32_t *sq = ff_squareTbl + 256;
273

    
274
    s = 0;
275
    for (i = 0; i < h; i++) {
276
        s += sq[pix1[ 0] - pix2[ 0]];
277
        s += sq[pix1[ 1] - pix2[ 1]];
278
        s += sq[pix1[ 2] - pix2[ 2]];
279
        s += sq[pix1[ 3] - pix2[ 3]];
280
        s += sq[pix1[ 4] - pix2[ 4]];
281
        s += sq[pix1[ 5] - pix2[ 5]];
282
        s += sq[pix1[ 6] - pix2[ 6]];
283
        s += sq[pix1[ 7] - pix2[ 7]];
284
        s += sq[pix1[ 8] - pix2[ 8]];
285
        s += sq[pix1[ 9] - pix2[ 9]];
286
        s += sq[pix1[10] - pix2[10]];
287
        s += sq[pix1[11] - pix2[11]];
288
        s += sq[pix1[12] - pix2[12]];
289
        s += sq[pix1[13] - pix2[13]];
290
        s += sq[pix1[14] - pix2[14]];
291
        s += sq[pix1[15] - pix2[15]];
292

    
293
        pix1 += line_size;
294
        pix2 += line_size;
295
    }
296
    return s;
297
}
298

    
299
/**
 * Replicate the border samples of a picture into a margin of w samples on
 * every side.
 *
 * @param buf    top-left sample of the picture; the caller must provide w
 *               writable rows above and below and w writable samples to the
 *               left and right of every row
 * @param wrap   distance in bytes between the starts of two rows
 * @param width  picture width in samples
 * @param height picture height in rows
 * @param w      margin size
 */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *const first_row = buf;
    uint8_t *const final_row = buf + (height - 1) * wrap;
    uint8_t *row;
    int n;

    /* above and below: repeat the first and the last picture row */
    for (n = 1; n <= w; n++) {
        memcpy(first_row - n * wrap, first_row, width);
        memcpy(final_row + n * wrap, final_row, width);
    }

    /* left and right: repeat the first and the last sample of each row */
    row = first_row;
    for (n = 0; n < height; n++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* corners: repeat the four corner samples of the picture */
    for (n = 1; n <= w; n++) {
        uint8_t *above = first_row - n * wrap;
        uint8_t *below = final_row + n * wrap;

        memset(above - w,     first_row[0],         w); /* top left */
        memset(above + width, first_row[width - 1], w); /* top right */
        memset(below - w,     final_row[0],         w); /* bottom left */
        memset(below + width, final_row[width - 1], w); /* bottom right */
    }
}
327

    
328
/**
 * Copy a rectangular area of samples into a temporary buffer, replicating
 * the border samples for the parts of the area that lie outside the source.
 *
 * @param buf      destination buffer
 * @param src      source buffer, pointing at the (possibly out-of-picture)
 *                 top-left sample of the block
 * @param linesize number of bytes between 2 vertically adjacent samples, used
 *                 for both the source and the destination buffer
 * @param block_w  width of the block
 * @param block_h  height of the block
 * @param src_x    x coordinate of the top-left sample of the block in the
 *                 source buffer
 * @param src_y    y coordinate of the top-left sample of the block in the
 *                 source buffer
 * @param w        width of the source buffer
 * @param h        height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h)
{
    int row, col;
    int first_row, last_row;    /* block rows [first_row, last_row) lie inside the source */
    int first_col, last_col;    /* block columns [first_col, last_col) lie inside the source */
    int span;

    /* A block lying completely outside the source is moved so that it
     * overlaps the nearest source row/column by exactly one sample. */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src  += (w - 1 - src_x);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += (1 - block_w - src_x);
        src_x = 1 - block_w;
    }

    first_row = FFMAX(0, -src_y);
    first_col = FFMAX(0, -src_x);
    last_row  = FFMIN(block_h, h - src_y);
    last_col  = FFMIN(block_w, w - src_x);
    assert(first_row < last_row && block_h);
    assert(first_col < last_col && block_w);

    /* From here on only the columns that exist in the source are copied. */
    span = last_col - first_col;
    src += first_row * linesize + first_col;
    buf += first_col;

    /* rows above the source: repeat the first available row */
    for (row = 0; row < first_row; row++) {
        memcpy(buf, src, span);
        buf += linesize;
    }

    /* rows that exist in the source */
    for (; row < last_row; row++) {
        memcpy(buf, src, span);
        src += linesize;
        buf += linesize;
    }

    /* rows below the source: repeat the last available row */
    src -= linesize;
    for (; row < block_h; row++) {
        memcpy(buf, src, span);
        buf += linesize;
    }

    /* Back to the top-left sample of the destination block, then extend
     * every row sideways. */
    buf -= block_h * linesize + first_col;
    while (block_h--) {
        /* left of the source: repeat the first copied sample */
        for (col = 0; col < first_col; col++) {
            buf[col] = buf[first_col];
        }

        /* right of the source: repeat the last copied sample */
        for (col = last_col; col < block_w; col++) {
            buf[col] = buf[last_col - 1];
        }
        buf += linesize;
    }
}
405

    
406
/**
 * Load an 8x8 area of samples into a coefficient block.
 *
 * @param block     destination, 64 contiguous coefficients (8 per row)
 * @param pixels    top-left sample of the source area
 * @param line_size source row stride in bytes
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        pixels += line_size;
        block  += 8;
    }
}
424

    
425
/**
 * Store the sample-wise difference s1 - s2 of two 8x8 areas in a
 * coefficient block.
 *
 * @param block  destination, 64 contiguous coefficients (8 per row)
 * @param s1     top-left sample of the minuend area
 * @param s2     top-left sample of the subtrahend area
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
444

    
445

    
446
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
447
                             int line_size)
448
{
449
    int i;
450
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
451

    
452
    /* read the pixels */
453
    for(i=0;i<8;i++) {
454
        pixels[0] = cm[block[0]];
455
        pixels[1] = cm[block[1]];
456
        pixels[2] = cm[block[2]];
457
        pixels[3] = cm[block[3]];
458
        pixels[4] = cm[block[4]];
459
        pixels[5] = cm[block[5]];
460
        pixels[6] = cm[block[6]];
461
        pixels[7] = cm[block[7]];
462

    
463
        pixels += line_size;
464
        block += 8;
465
    }
466
}
467

    
468
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
469
                                 int line_size)
470
{
471
    int i;
472
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
473

    
474
    /* read the pixels */
475
    for(i=0;i<4;i++) {
476
        pixels[0] = cm[block[0]];
477
        pixels[1] = cm[block[1]];
478
        pixels[2] = cm[block[2]];
479
        pixels[3] = cm[block[3]];
480

    
481
        pixels += line_size;
482
        block += 8;
483
    }
484
}
485

    
486
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
487
                                 int line_size)
488
{
489
    int i;
490
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
491

    
492
    /* read the pixels */
493
    for(i=0;i<2;i++) {
494
        pixels[0] = cm[block[0]];
495
        pixels[1] = cm[block[1]];
496

    
497
        pixels += line_size;
498
        block += 8;
499
    }
500
}
501

    
502
/**
 * Write an 8x8 block of signed coefficients to a sample area: each value is
 * biased by +128 and saturated to the 0..255 range.
 *
 * @param block     source, 64 contiguous coefficients (8 per row)
 * @param pixels    top-left sample of the destination area
 * @param line_size destination row stride in bytes
 */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int coeff = block[row * 8 + col];
            uint8_t sample;

            if (coeff < -128)
                sample = 0;
            else if (coeff > 127)
                sample = 255;
            else
                sample = (uint8_t)(coeff + 128);
            pixels[col] = sample;
        }
        pixels += line_size;
    }
}
522

    
523
/**
 * Write an 8x8 coefficient block to a sample area without any clamping; each
 * coefficient is simply converted to uint8_t.
 *
 * @param block     source, 64 contiguous coefficients (8 per row)
 * @param pixels    top-left sample of the destination area
 * @param line_size destination row stride in bytes
 */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = block[col];
        pixels += line_size;
        block  += 8;
    }
}
543

    
544
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
545
                             int line_size)
546
{
547
    int i;
548
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
549

    
550
    /* read the pixels */
551
    for(i=0;i<8;i++) {
552
        pixels[0] = cm[pixels[0] + block[0]];
553
        pixels[1] = cm[pixels[1] + block[1]];
554
        pixels[2] = cm[pixels[2] + block[2]];
555
        pixels[3] = cm[pixels[3] + block[3]];
556
        pixels[4] = cm[pixels[4] + block[4]];
557
        pixels[5] = cm[pixels[5] + block[5]];
558
        pixels[6] = cm[pixels[6] + block[6]];
559
        pixels[7] = cm[pixels[7] + block[7]];
560
        pixels += line_size;
561
        block += 8;
562
    }
563
}
564

    
565
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
566
                          int line_size)
567
{
568
    int i;
569
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
570

    
571
    /* read the pixels */
572
    for(i=0;i<4;i++) {
573
        pixels[0] = cm[pixels[0] + block[0]];
574
        pixels[1] = cm[pixels[1] + block[1]];
575
        pixels[2] = cm[pixels[2] + block[2]];
576
        pixels[3] = cm[pixels[3] + block[3]];
577
        pixels += line_size;
578
        block += 8;
579
    }
580
}
581

    
582
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
583
                          int line_size)
584
{
585
    int i;
586
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
587

    
588
    /* read the pixels */
589
    for(i=0;i<2;i++) {
590
        pixels[0] = cm[pixels[0] + block[0]];
591
        pixels[1] = cm[pixels[1] + block[1]];
592
        pixels += line_size;
593
        block += 8;
594
    }
595
}
596

    
597
/**
 * Add an 8x8 coefficient block to a sample area in place, without clamping
 * (each sum is truncated to uint8_t).
 *
 * @param pixels    top-left sample of the area to update
 * @param block     64 contiguous coefficients (8 per row) to add
 * @param line_size row stride of the area in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 8;
    }
}
613

    
614
/**
 * Add a 4x4 coefficient block to a sample area in place, without clamping
 * (each sum is truncated to uint8_t).
 *
 * @param pixels    top-left sample of the area to update
 * @param block     16 contiguous coefficients to add; unlike the 8x8
 *                  helpers, consecutive rows are 4 entries apart
 * @param line_size row stride of the area in bytes
 */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int row, col;

    for (row = 0; row < 4; row++) {
        for (col = 0; col < 4; col++)
            pixels[col] += block[col];
        pixels += line_size;
        block  += 4;
    }
}
626

    
627
/**
 * Sum the absolute values of the 64 coefficients of a block.
 *
 * @param block 64 contiguous coefficients
 * @return sum of FFABS() over all coefficients
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *cur = block;
    const DCTELEM *const end = block + 64;
    int total = 0;

    while (cur < end) {
        /* Keep the increment outside FFABS(): it is a macro. */
        total += FFABS(*cur);
        cur++;
    }
    return total;
}
634

    
635
/**
 * Set a 16-sample-wide, h-row area to a constant value.
 *
 * @param block     top-left sample of the area
 * @param value     value to store in every sample
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written if h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;

    while (h-- > 0) {
        memset(row, value, 16);
        row += line_size;
    }
}
644

    
645
/**
 * Set an 8-sample-wide, h-row area to a constant value.
 *
 * @param block     top-left sample of the area
 * @param value     value to store in every sample
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written if h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;

    while (h-- > 0) {
        memset(row, value, 8);
        row += line_size;
    }
}
654

    
655
/**
 * Upscale an 8x8 block to 16x16 by duplicating every sample horizontally and
 * vertically.
 *
 * The samples are stored byte by byte. The earlier version wrote the
 * uint8_t destination through uint16_t pointers, which violated the
 * strict-aliasing rule and relied on dst and linesize being 2-byte aligned;
 * the bytes written are the same.
 *
 * @param src      64 contiguous source samples (8 per row)
 * @param dst      top-left sample of the 16x16 destination area
 * @param linesize destination row stride in bytes
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        uint8_t *upper = dst;            /* destination row 2 * row     */
        uint8_t *lower = dst + linesize; /* destination row 2 * row + 1 */

        for (col = 0; col < 8; col++) {
            const uint8_t sample = src[col];

            upper[2 * col] = upper[2 * col + 1] = sample;
            lower[2 * col] = lower[2 * col + 1] = sample;
        }
        src += 8;
        dst += 2 * linesize;  /* each source row yields two destination rows */
    }
}
670

    
671
#if 0
672

673
#define PIXOP2(OPNAME, OP) \
674
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
675
{\
676
    int i;\
677
    for(i=0; i<h; i++){\
678
        OP(*((uint64_t*)block), AV_RN64(pixels));\
679
        pixels+=line_size;\
680
        block +=line_size;\
681
    }\
682
}\
683
\
684
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
685
{\
686
    int i;\
687
    for(i=0; i<h; i++){\
688
        const uint64_t a= AV_RN64(pixels  );\
689
        const uint64_t b= AV_RN64(pixels+1);\
690
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
691
        pixels+=line_size;\
692
        block +=line_size;\
693
    }\
694
}\
695
\
696
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
697
{\
698
    int i;\
699
    for(i=0; i<h; i++){\
700
        const uint64_t a= AV_RN64(pixels  );\
701
        const uint64_t b= AV_RN64(pixels+1);\
702
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
703
        pixels+=line_size;\
704
        block +=line_size;\
705
    }\
706
}\
707
\
708
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
709
{\
710
    int i;\
711
    for(i=0; i<h; i++){\
712
        const uint64_t a= AV_RN64(pixels          );\
713
        const uint64_t b= AV_RN64(pixels+line_size);\
714
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
715
        pixels+=line_size;\
716
        block +=line_size;\
717
    }\
718
}\
719
\
720
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
721
{\
722
    int i;\
723
    for(i=0; i<h; i++){\
724
        const uint64_t a= AV_RN64(pixels          );\
725
        const uint64_t b= AV_RN64(pixels+line_size);\
726
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
727
        pixels+=line_size;\
728
        block +=line_size;\
729
    }\
730
}\
731
\
732
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
733
{\
734
        int i;\
735
        const uint64_t a= AV_RN64(pixels  );\
736
        const uint64_t b= AV_RN64(pixels+1);\
737
        uint64_t l0=  (a&0x0303030303030303ULL)\
738
                    + (b&0x0303030303030303ULL)\
739
                    + 0x0202020202020202ULL;\
740
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
741
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
742
        uint64_t l1,h1;\
743
\
744
        pixels+=line_size;\
745
        for(i=0; i<h; i+=2){\
746
            uint64_t a= AV_RN64(pixels  );\
747
            uint64_t b= AV_RN64(pixels+1);\
748
            l1=  (a&0x0303030303030303ULL)\
749
               + (b&0x0303030303030303ULL);\
750
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
751
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
752
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
753
            pixels+=line_size;\
754
            block +=line_size;\
755
            a= AV_RN64(pixels  );\
756
            b= AV_RN64(pixels+1);\
757
            l0=  (a&0x0303030303030303ULL)\
758
               + (b&0x0303030303030303ULL)\
759
               + 0x0202020202020202ULL;\
760
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
761
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
762
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
763
            pixels+=line_size;\
764
            block +=line_size;\
765
        }\
766
}\
767
\
768
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
769
{\
770
        int i;\
771
        const uint64_t a= AV_RN64(pixels  );\
772
        const uint64_t b= AV_RN64(pixels+1);\
773
        uint64_t l0=  (a&0x0303030303030303ULL)\
774
                    + (b&0x0303030303030303ULL)\
775
                    + 0x0101010101010101ULL;\
776
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
777
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
778
        uint64_t l1,h1;\
779
\
780
        pixels+=line_size;\
781
        for(i=0; i<h; i+=2){\
782
            uint64_t a= AV_RN64(pixels  );\
783
            uint64_t b= AV_RN64(pixels+1);\
784
            l1=  (a&0x0303030303030303ULL)\
785
               + (b&0x0303030303030303ULL);\
786
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
787
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
788
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
789
            pixels+=line_size;\
790
            block +=line_size;\
791
            a= AV_RN64(pixels  );\
792
            b= AV_RN64(pixels+1);\
793
            l0=  (a&0x0303030303030303ULL)\
794
               + (b&0x0303030303030303ULL)\
795
               + 0x0101010101010101ULL;\
796
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
797
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
798
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
799
            pixels+=line_size;\
800
            block +=line_size;\
801
        }\
802
}\
803
\
804
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
805
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
806
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
807
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
808
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
809
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
810
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
811

812
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
813
#else // 64 bit variant
814

    
815
#define PIXOP2(OPNAME, OP) \
816
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
817
    int i;\
818
    for(i=0; i<h; i++){\
819
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
820
        pixels+=line_size;\
821
        block +=line_size;\
822
    }\
823
}\
824
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
825
    int i;\
826
    for(i=0; i<h; i++){\
827
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
828
        pixels+=line_size;\
829
        block +=line_size;\
830
    }\
831
}\
832
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
833
    int i;\
834
    for(i=0; i<h; i++){\
835
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
836
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
837
        pixels+=line_size;\
838
        block +=line_size;\
839
    }\
840
}\
841
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
843
}\
844
\
845
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
846
                                                int src_stride1, int src_stride2, int h){\
847
    int i;\
848
    for(i=0; i<h; i++){\
849
        uint32_t a,b;\
850
        a= AV_RN32(&src1[i*src_stride1  ]);\
851
        b= AV_RN32(&src2[i*src_stride2  ]);\
852
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
853
        a= AV_RN32(&src1[i*src_stride1+4]);\
854
        b= AV_RN32(&src2[i*src_stride2+4]);\
855
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
856
    }\
857
}\
858
\
859
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
860
                                                int src_stride1, int src_stride2, int h){\
861
    int i;\
862
    for(i=0; i<h; i++){\
863
        uint32_t a,b;\
864
        a= AV_RN32(&src1[i*src_stride1  ]);\
865
        b= AV_RN32(&src2[i*src_stride2  ]);\
866
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
867
        a= AV_RN32(&src1[i*src_stride1+4]);\
868
        b= AV_RN32(&src2[i*src_stride2+4]);\
869
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
870
    }\
871
}\
872
\
873
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
874
                                                int src_stride1, int src_stride2, int h){\
875
    int i;\
876
    for(i=0; i<h; i++){\
877
        uint32_t a,b;\
878
        a= AV_RN32(&src1[i*src_stride1  ]);\
879
        b= AV_RN32(&src2[i*src_stride2  ]);\
880
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
881
    }\
882
}\
883
\
884
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
885
                                                int src_stride1, int src_stride2, int h){\
886
    int i;\
887
    for(i=0; i<h; i++){\
888
        uint32_t a,b;\
889
        a= AV_RN16(&src1[i*src_stride1  ]);\
890
        b= AV_RN16(&src2[i*src_stride2  ]);\
891
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
892
    }\
893
}\
894
\
895
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
896
                                                int src_stride1, int src_stride2, int h){\
897
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
898
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
899
}\
900
\
901
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
902
                                                int src_stride1, int src_stride2, int h){\
903
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
904
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
905
}\
906
\
907
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
909
}\
910
\
911
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
912
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
913
}\
914
\
915
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
916
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
917
}\
918
\
919
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
920
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
921
}\
922
\
923
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
924
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
925
    int i;\
926
    for(i=0; i<h; i++){\
927
        uint32_t a, b, c, d, l0, l1, h0, h1;\
928
        a= AV_RN32(&src1[i*src_stride1]);\
929
        b= AV_RN32(&src2[i*src_stride2]);\
930
        c= AV_RN32(&src3[i*src_stride3]);\
931
        d= AV_RN32(&src4[i*src_stride4]);\
932
        l0=  (a&0x03030303UL)\
933
           + (b&0x03030303UL)\
934
           + 0x02020202UL;\
935
        h0= ((a&0xFCFCFCFCUL)>>2)\
936
          + ((b&0xFCFCFCFCUL)>>2);\
937
        l1=  (c&0x03030303UL)\
938
           + (d&0x03030303UL);\
939
        h1= ((c&0xFCFCFCFCUL)>>2)\
940
          + ((d&0xFCFCFCFCUL)>>2);\
941
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
942
        a= AV_RN32(&src1[i*src_stride1+4]);\
943
        b= AV_RN32(&src2[i*src_stride2+4]);\
944
        c= AV_RN32(&src3[i*src_stride3+4]);\
945
        d= AV_RN32(&src4[i*src_stride4+4]);\
946
        l0=  (a&0x03030303UL)\
947
           + (b&0x03030303UL)\
948
           + 0x02020202UL;\
949
        h0= ((a&0xFCFCFCFCUL)>>2)\
950
          + ((b&0xFCFCFCFCUL)>>2);\
951
        l1=  (c&0x03030303UL)\
952
           + (d&0x03030303UL);\
953
        h1= ((c&0xFCFCFCFCUL)>>2)\
954
          + ((d&0xFCFCFCFCUL)>>2);\
955
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
956
    }\
957
}\
958
\
959
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
961
}\
962
\
963
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
964
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
965
}\
966
\
967
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
968
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
969
}\
970
\
971
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
972
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
973
}\
974
\
975
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
976
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
977
    int i;\
978
    for(i=0; i<h; i++){\
979
        uint32_t a, b, c, d, l0, l1, h0, h1;\
980
        a= AV_RN32(&src1[i*src_stride1]);\
981
        b= AV_RN32(&src2[i*src_stride2]);\
982
        c= AV_RN32(&src3[i*src_stride3]);\
983
        d= AV_RN32(&src4[i*src_stride4]);\
984
        l0=  (a&0x03030303UL)\
985
           + (b&0x03030303UL)\
986
           + 0x01010101UL;\
987
        h0= ((a&0xFCFCFCFCUL)>>2)\
988
          + ((b&0xFCFCFCFCUL)>>2);\
989
        l1=  (c&0x03030303UL)\
990
           + (d&0x03030303UL);\
991
        h1= ((c&0xFCFCFCFCUL)>>2)\
992
          + ((d&0xFCFCFCFCUL)>>2);\
993
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994
        a= AV_RN32(&src1[i*src_stride1+4]);\
995
        b= AV_RN32(&src2[i*src_stride2+4]);\
996
        c= AV_RN32(&src3[i*src_stride3+4]);\
997
        d= AV_RN32(&src4[i*src_stride4+4]);\
998
        l0=  (a&0x03030303UL)\
999
           + (b&0x03030303UL)\
1000
           + 0x01010101UL;\
1001
        h0= ((a&0xFCFCFCFCUL)>>2)\
1002
          + ((b&0xFCFCFCFCUL)>>2);\
1003
        l1=  (c&0x03030303UL)\
1004
           + (d&0x03030303UL);\
1005
        h1= ((c&0xFCFCFCFCUL)>>2)\
1006
          + ((d&0xFCFCFCFCUL)>>2);\
1007
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008
    }\
1009
}\
1010
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1011
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1012
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1013
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1014
}\
1015
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1019
}\
1020
\
1021
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1022
{\
1023
        int i, a0, b0, a1, b1;\
1024
        a0= pixels[0];\
1025
        b0= pixels[1] + 2;\
1026
        a0 += b0;\
1027
        b0 += pixels[2];\
1028
\
1029
        pixels+=line_size;\
1030
        for(i=0; i<h; i+=2){\
1031
            a1= pixels[0];\
1032
            b1= pixels[1];\
1033
            a1 += b1;\
1034
            b1 += pixels[2];\
1035
\
1036
            block[0]= (a1+a0)>>2; /* FIXME non put */\
1037
            block[1]= (b1+b0)>>2;\
1038
\
1039
            pixels+=line_size;\
1040
            block +=line_size;\
1041
\
1042
            a0= pixels[0];\
1043
            b0= pixels[1] + 2;\
1044
            a0 += b0;\
1045
            b0 += pixels[2];\
1046
\
1047
            block[0]= (a1+a0)>>2;\
1048
            block[1]= (b1+b0)>>2;\
1049
            pixels+=line_size;\
1050
            block +=line_size;\
1051
        }\
1052
}\
1053
\
1054
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1055
{\
1056
        int i;\
1057
        const uint32_t a= AV_RN32(pixels  );\
1058
        const uint32_t b= AV_RN32(pixels+1);\
1059
        uint32_t l0=  (a&0x03030303UL)\
1060
                    + (b&0x03030303UL)\
1061
                    + 0x02020202UL;\
1062
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1063
                   + ((b&0xFCFCFCFCUL)>>2);\
1064
        uint32_t l1,h1;\
1065
\
1066
        pixels+=line_size;\
1067
        for(i=0; i<h; i+=2){\
1068
            uint32_t a= AV_RN32(pixels  );\
1069
            uint32_t b= AV_RN32(pixels+1);\
1070
            l1=  (a&0x03030303UL)\
1071
               + (b&0x03030303UL);\
1072
            h1= ((a&0xFCFCFCFCUL)>>2)\
1073
              + ((b&0xFCFCFCFCUL)>>2);\
1074
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1075
            pixels+=line_size;\
1076
            block +=line_size;\
1077
            a= AV_RN32(pixels  );\
1078
            b= AV_RN32(pixels+1);\
1079
            l0=  (a&0x03030303UL)\
1080
               + (b&0x03030303UL)\
1081
               + 0x02020202UL;\
1082
            h0= ((a&0xFCFCFCFCUL)>>2)\
1083
              + ((b&0xFCFCFCFCUL)>>2);\
1084
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1085
            pixels+=line_size;\
1086
            block +=line_size;\
1087
        }\
1088
}\
1089
\
1090
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1091
{\
1092
    int j;\
1093
    for(j=0; j<2; j++){\
1094
        int i;\
1095
        const uint32_t a= AV_RN32(pixels  );\
1096
        const uint32_t b= AV_RN32(pixels+1);\
1097
        uint32_t l0=  (a&0x03030303UL)\
1098
                    + (b&0x03030303UL)\
1099
                    + 0x02020202UL;\
1100
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1101
                   + ((b&0xFCFCFCFCUL)>>2);\
1102
        uint32_t l1,h1;\
1103
\
1104
        pixels+=line_size;\
1105
        for(i=0; i<h; i+=2){\
1106
            uint32_t a= AV_RN32(pixels  );\
1107
            uint32_t b= AV_RN32(pixels+1);\
1108
            l1=  (a&0x03030303UL)\
1109
               + (b&0x03030303UL);\
1110
            h1= ((a&0xFCFCFCFCUL)>>2)\
1111
              + ((b&0xFCFCFCFCUL)>>2);\
1112
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1113
            pixels+=line_size;\
1114
            block +=line_size;\
1115
            a= AV_RN32(pixels  );\
1116
            b= AV_RN32(pixels+1);\
1117
            l0=  (a&0x03030303UL)\
1118
               + (b&0x03030303UL)\
1119
               + 0x02020202UL;\
1120
            h0= ((a&0xFCFCFCFCUL)>>2)\
1121
              + ((b&0xFCFCFCFCUL)>>2);\
1122
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1123
            pixels+=line_size;\
1124
            block +=line_size;\
1125
        }\
1126
        pixels+=4-line_size*(h+1);\
1127
        block +=4-line_size*h;\
1128
    }\
1129
}\
1130
\
1131
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1132
{\
1133
    int j;\
1134
    for(j=0; j<2; j++){\
1135
        int i;\
1136
        const uint32_t a= AV_RN32(pixels  );\
1137
        const uint32_t b= AV_RN32(pixels+1);\
1138
        uint32_t l0=  (a&0x03030303UL)\
1139
                    + (b&0x03030303UL)\
1140
                    + 0x01010101UL;\
1141
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142
                   + ((b&0xFCFCFCFCUL)>>2);\
1143
        uint32_t l1,h1;\
1144
\
1145
        pixels+=line_size;\
1146
        for(i=0; i<h; i+=2){\
1147
            uint32_t a= AV_RN32(pixels  );\
1148
            uint32_t b= AV_RN32(pixels+1);\
1149
            l1=  (a&0x03030303UL)\
1150
               + (b&0x03030303UL);\
1151
            h1= ((a&0xFCFCFCFCUL)>>2)\
1152
              + ((b&0xFCFCFCFCUL)>>2);\
1153
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1154
            pixels+=line_size;\
1155
            block +=line_size;\
1156
            a= AV_RN32(pixels  );\
1157
            b= AV_RN32(pixels+1);\
1158
            l0=  (a&0x03030303UL)\
1159
               + (b&0x03030303UL)\
1160
               + 0x01010101UL;\
1161
            h0= ((a&0xFCFCFCFCUL)>>2)\
1162
              + ((b&0xFCFCFCFCUL)>>2);\
1163
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1164
            pixels+=line_size;\
1165
            block +=line_size;\
1166
        }\
1167
        pixels+=4-line_size*(h+1);\
1168
        block +=4-line_size*h;\
1169
    }\
1170
}\
1171
\
1172
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1173
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1174
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1175
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1176
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1177
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1178
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1179
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1180

    
1181
/* "avg" store operation for PIXOP2: the destination becomes rnd_avg32() of
 * its current contents and the freshly computed value. */
#define op_avg(a, b) a = rnd_avg32(a, b)
1182
/* NOTE(review): this closes a preprocessor conditional that is opened earlier
 * in the file, outside this excerpt -- confirm which branch is active. */
#endif
1183
/* "put" store operation for PIXOP2: the new value overwrites the destination. */
#define op_put(a, b) a = b
1184

    
1185
/* Instantiate the PIXOP2 function family twice: once with the avg prefix and
 * averaging stores, once with the put prefix and plain stores. */
PIXOP2(avg, op_avg)
1186
PIXOP2(put, op_put)
1187
/* Drop the store helpers so the names op_avg/op_put can be redefined later. */
#undef op_avg
1188
#undef op_put
1189

    
1190
/* The "no rounding" full-pel put variants are plain aliases of the regular
 * put functions. */
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

/* Rounded mean of two / four values.  Every argument is parenthesized so that
 * arguments containing operators of lower precedence than '+' (e.g. '|' or
 * '>>') are averaged as written instead of being re-associated. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1195

    
1196
/**
 * Single-stride entry point for put_no_rnd_pixels16_l2().
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride, used for dst and for both sources
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1199

    
1200
/**
 * Single-stride entry point for put_no_rnd_pixels8_l2().
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride, used for dst and for both sources
 * @param h      number of rows to process
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1203

    
1204
/**
 * Bilinear interpolation of an 8-pixel-wide block.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood whose
 * top-left corner is at the same position.  The four weights are built from
 * x16 and y16 and always add up to 256, which the final ">> 8" divides out.
 *
 * @param dst     destination; 8 pixels are written per row
 * @param src     source; 9 pixels of rows i and i+1 are read for output row i
 * @param stride  line stride shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours (out of 16)
 * @param y16     vertical weight of the lower neighbours (out of 16)
 * @param rounder constant added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (     x16) * (16 - y16);
    const int w_bl = (16 - x16) * (     y16);
    const int w_br = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        /* columns are processed left to right, like the unrolled original */
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1226

    
1227
/**
 * Warped motion compensation of an 8-pixel-wide block.
 *
 * For every output pixel a source position (vx, vy) is tracked in fixed point:
 * after ">> 16" the low `shift` bits are the sub-pixel fraction and the
 * remaining bits the integer sample position.  Samples are interpolated
 * bilinearly; positions outside the picture are clamped with av_clip() and the
 * interpolation degenerates to one dimension (or a plain copy) there.
 *
 * @param dst      destination; pixel (x, y) is written to dst[y*stride + x]
 * @param src      source picture
 * @param stride   line stride shared by dst and src
 * @param h        number of output rows
 * @param ox, oy   source position of output pixel (0, 0)
 * @param dxx, dyx increment of (vx, vy) per output column
 * @param dxy, dyy increment of (ox, oy) per output row
 * @param shift    number of sub-pixel fraction bits
 * @param r        rounding constant added before the final ">> (2*shift)"
 * @param width    source width used for the range check / clamping
 * @param height   source height used for the range check / clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    /* After the decrement a position passing the range check below still has
     * a right / lower neighbour inside the original dimensions. */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int inside_x, inside_y, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned cast rejects negative coordinates as well */
            inside_x = (unsigned)src_x < width;
            inside_y = (unsigned)src_y < height;

            if (inside_x && inside_y) {
                /* full 2-D bilinear interpolation */
                index = src_x + src_y * stride;
                dst[y * stride + x] = (  (  src[index             ] * (s - frac_x)
                                          + src[index          + 1] *      frac_x ) * (s - frac_y)
                                       + (  src[index + stride    ] * (s - frac_x)
                                          + src[index + stride + 1] *      frac_x ) *      frac_y
                                       + r) >> (shift * 2);
            } else if (inside_x) {
                /* row clamped: interpolate horizontally only */
                index = src_x + av_clip(src_y, 0, height) * stride;
                dst[y * stride + x] = (  (  src[index    ] * (s - frac_x)
                                          + src[index + 1] *      frac_x ) * s
                                       + r) >> (shift * 2);
            } else if (inside_y) {
                /* column clamped: interpolate vertically only */
                index = av_clip(src_x, 0, width) + src_y * stride;
                dst[y * stride + x] = (  (  src[index         ] * (s - frac_y)
                                          + src[index + stride] *      frac_y ) * s
                                       + r) >> (shift * 2);
            } else {
                /* both clamped: copy the nearest edge sample */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                dst[y * stride + x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1284

    
1285
/**
 * Full-pel "put" case of the tpel family: dispatch on the block width to the
 * matching put_pixelsN_c() routine.  Widths other than 2, 4, 8 and 16 are
 * ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:
        put_pixels2_c(dst, src, stride, height);
        break;
    case 4:
        put_pixels4_c(dst, src, stride, height);
        break;
    case 8:
        put_pixels8_c(dst, src, stride, height);
        break;
    case 16:
        put_pixels16_c(dst, src, stride, height);
        break;
    default:
        /* unsupported width: nothing is written */
        break;
    }
}
1293

    
1294
/**
 * Third-pel interpolation, horizontal: dst = (2*cur + right + 1) / 3.
 * The division by 3 is done as a multiplication by 683 and ">> 11"
 * (683 * 3 = 2049, just above 2^11).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; src[width] of each row is read as well
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1304

    
1305
/**
 * Third-pel interpolation, horizontal: dst = (cur + 2*right + 1) / 3.
 * The division by 3 is done as a multiplication by 683 and ">> 11"
 * (683 * 3 = 2049, just above 2^11).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; src[width] of each row is read as well
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1315

    
1316
/**
 * Third-pel interpolation, vertical: dst = (2*cur + below + 1) / 3.
 * The division by 3 is done as a multiplication by 683 and ">> 11"
 * (683 * 3 = 2049, just above 2^11).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row
 * @param height number of rows; source row `height` is read as well
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1326

    
1327
/**
 * Third-pel interpolation, diagonal, over the 2x2 neighbourhood with weights
 *     4 3
 *     3 2
 * The sum (plus 6 for rounding) is divided by 12 via a multiplication by 2731
 * and ">> 15" (2731 * 12 = 32772, just above 2^15).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1] +
                            3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1337

    
1338
/**
 * Third-pel interpolation, diagonal, over the 2x2 neighbourhood with weights
 *     3 2
 *     4 3
 * The sum (plus 6 for rounding) is divided by 12 via a multiplication by 2731
 * and ">> 15" (2731 * 12 = 32772, just above 2^15).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1] +
                            4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1348

    
1349
/**
 * Third-pel interpolation, vertical: dst = (cur + 2*below + 1) / 3.
 * The division by 3 is done as a multiplication by 683 and ">> 11"
 * (683 * 3 = 2049, just above 2^11).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row
 * @param height number of rows; source row `height` is read as well
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1359

    
1360
/**
 * Third-pel interpolation, diagonal, over the 2x2 neighbourhood with weights
 *     3 4
 *     2 3
 * The sum (plus 6 for rounding) is divided by 12 via a multiplication by 2731
 * and ">> 15" (2731 * 12 = 32772, just above 2^15).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1] +
                            2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1370

    
1371
/**
 * Third-pel interpolation, diagonal, over the 2x2 neighbourhood with weights
 *     2 3
 *     3 4
 * The sum (plus 6 for rounding) is divided by 12 via a multiplication by 2731
 * and ">> 15" (2731 * 12 = 32772, just above 2^15).
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1] +
                            3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1381

    
1382
/**
 * Full-pel "avg" case of the tpel family: dispatch on the block width to the
 * matching avg_pixelsN_c() routine.  Widths other than 2, 4, 8 and 16 are
 * ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:
        avg_pixels2_c(dst, src, stride, height);
        break;
    case 4:
        avg_pixels4_c(dst, src, stride, height);
        break;
    case 8:
        avg_pixels8_c(dst, src, stride, height);
        break;
    case 16:
        avg_pixels16_c(dst, src, stride, height);
        break;
    default:
        /* unsupported width: dst is left untouched */
        break;
    }
}
1390

    
1391
/**
 * Averaging variant of the horizontal third-pel filter (2*cur + right + 1) / 3:
 * the interpolated value is merged into dst as (dst + value + 1) >> 1.
 * The division by 3 is a multiplication by 683 and ">> 11".
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; src[width] of each row is read as well
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1401

    
1402
/**
 * Averaging variant of the horizontal third-pel filter (cur + 2*right + 1) / 3:
 * the interpolated value is merged into dst as (dst + value + 1) >> 1.
 * The division by 3 is a multiplication by 683 and ">> 11".
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; src[width] of each row is read as well
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1412

    
1413
/**
 * Averaging variant of the vertical third-pel filter (2*cur + below + 1) / 3:
 * the interpolated value is merged into dst as (dst + value + 1) >> 1.
 * The division by 3 is a multiplication by 683 and ">> 11".
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row
 * @param height number of rows; source row `height` is read as well
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1423

    
1424
/**
 * Averaging variant of the diagonal third-pel filter with 2x2 weights
 *     4 3
 *     3 2
 * (sum + 6, divided by 12 via "* 2731 >> 15").  The interpolated value is
 * merged into dst as (dst + value + 1) >> 1.
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum  = 4 * src[col]          + 3 * src[col + 1] +
                             3 * src[col + stride] + 2 * src[col + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1434

    
1435
/**
 * Averaging variant of the diagonal third-pel filter with 2x2 weights
 *     3 2
 *     4 3
 * (sum + 6, divided by 12 via "* 2731 >> 15").  The interpolated value is
 * merged into dst as (dst + value + 1) >> 1.
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum  = 3 * src[col]          + 2 * src[col + 1] +
                             4 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1445

    
1446
/**
 * Averaging variant of the vertical third-pel filter (cur + 2*below + 1) / 3:
 * the interpolated value is merged into dst as (dst + value + 1) >> 1.
 * The division by 3 is a multiplication by 683 and ">> 11".
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row
 * @param height number of rows; source row `height` is read as well
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1456

    
1457
/**
 * Averaging variant of the diagonal third-pel filter with 2x2 weights
 *     3 4
 *     2 3
 * (sum + 6, divided by 12 via "* 2731 >> 15").  The interpolated value is
 * merged into dst as (dst + value + 1) >> 1.
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum  = 3 * src[col]          + 4 * src[col + 1] +
                             2 * src[col + stride] + 3 * src[col + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1467

    
1468
/**
 * Averaging variant of the diagonal third-pel filter with 2x2 weights
 *     2 3
 *     3 4
 * (sum + 6, divided by 12 via "* 2731 >> 15").  The interpolated value is
 * merged into dst as (dst + value + 1) >> 1.
 *
 * @param stride line stride shared by dst and src
 * @param width  pixels per row; one extra source column is read
 * @param height number of rows; one extra source row is read
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum  = 2 * src[col]          + 3 * src[col + 1] +
                             3 * src[col + stride] + 4 * src[col + stride + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1478
#if 0
/* Disabled generator for fixed-width put_tpel wrappers: TPEL_WIDTH(w) would
 * define put_tpel_pixels<w>_mcXY_c(dst, src, stride, height) for all nine
 * third-pel positions, each forwarding to the generic put_tpel_pixels_mcXY_c()
 * with the width filled in.  The bodies are plain calls (a stray "void" in
 * front of each call previously turned them into invalid declarations). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1499

    
1500
/**
 * Generate one chroma motion-compensation function of width W:
 *     OPNAME h264_chroma_mc<W>_c(dst, src, stride, h, x, y)
 *
 * x and y (each asserted to be in 0..7) select the bilinear weights
 *     A = (8-x)(8-y)   B = x(8-y)   C = (8-x)y   D = xy
 * which always add up to 64; OP receives the unnormalized weighted sum and is
 * responsible for rounding, scaling and storing it.
 *
 * When D is zero at least one of x, y is zero, so at most one neighbour
 * besides the current sample contributes: its weight is E = B + C and it lies
 * one row down (step = stride) if C is non-zero, otherwise one column to the
 * right (step = 1).
 *
 * Output row i reads source rows i and i+1 (D != 0 or C != 0) and W+1 source
 * columns (D != 0 or C == 0).
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* Instantiate the 2-, 4- and 8-pixel-wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* b is a weighted sum scaled by 64: round, shift it back, then either average
 * it into the destination (avg) or store it (put). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1608

    
1609
#define QPEL_MC(r, OPNAME, RND, OP) \
1610
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1611
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1612
    int i;\
1613
    for(i=0; i<h; i++)\
1614
    {\
1615
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1616
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1617
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1618
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1619
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1620
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1621
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1622
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1623
        dst+=dstStride;\
1624
        src+=srcStride;\
1625
    }\
1626
}\
1627
\
1628
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1629
    const int w=8;\
1630
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1631
    int i;\
1632
    for(i=0; i<w; i++)\
1633
    {\
1634
        const int src0= src[0*srcStride];\
1635
        const int src1= src[1*srcStride];\
1636
        const int src2= src[2*srcStride];\
1637
        const int src3= src[3*srcStride];\
1638
        const int src4= src[4*srcStride];\
1639
        const int src5= src[5*srcStride];\
1640
        const int src6= src[6*srcStride];\
1641
        const int src7= src[7*srcStride];\
1642
        const int src8= src[8*srcStride];\
1643
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1644
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1645
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1646
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1647
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1648
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1649
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1650
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1651
        dst++;\
1652
        src++;\
1653
    }\
1654
}\
1655
\
1656
/* Horizontal MPEG-4 qpel low-pass for h rows of 16 pixels.\
 * Each row reads src[0..16] (17 samples) and writes dst[0..15] with the\
 * symmetric taps (20, -6, 3, -1); at both row ends the tap indices fold\
 * back into 0..16. Samples are read straight from src for every output,\
 * in the same order as the stores. OP (macro parameter) does the store\
 * and relies on the local cm pointer. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int y;\
    \
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride) {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
    }\
}\
1682
\
1683
/* Vertical MPEG-4 qpel low-pass for a 16-column, 16-row block.\
 * For every column, 17 vertically adjacent samples (rows 0..16 of src) are\
 * read first, then 16 output rows are produced with the symmetric taps\
 * (20, -6, 3, -1); near the first and last rows the tap indices fold back\
 * into 0..16. OP (macro parameter) does the store and relies on cm. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for (col = 0; col < 16; col++, dst++, src++) {\
        int r[17];\
        /* gather the whole input column before any output is written */\
        for (k = 0; k < 17; k++)\
            r[k] = src[k*srcStride];\
        OP(dst[ 0*dstStride], (r[ 0]+r[ 1])*20 - (r[ 0]+r[ 2])*6 + (r[ 1]+r[ 3])*3 - (r[ 2]+r[ 4]));\
        OP(dst[ 1*dstStride], (r[ 1]+r[ 2])*20 - (r[ 0]+r[ 3])*6 + (r[ 0]+r[ 4])*3 - (r[ 1]+r[ 5]));\
        OP(dst[ 2*dstStride], (r[ 2]+r[ 3])*20 - (r[ 1]+r[ 4])*6 + (r[ 0]+r[ 5])*3 - (r[ 0]+r[ 6]));\
        OP(dst[ 3*dstStride], (r[ 3]+r[ 4])*20 - (r[ 2]+r[ 5])*6 + (r[ 1]+r[ 6])*3 - (r[ 0]+r[ 7]));\
        OP(dst[ 4*dstStride], (r[ 4]+r[ 5])*20 - (r[ 3]+r[ 6])*6 + (r[ 2]+r[ 7])*3 - (r[ 1]+r[ 8]));\
        OP(dst[ 5*dstStride], (r[ 5]+r[ 6])*20 - (r[ 4]+r[ 7])*6 + (r[ 3]+r[ 8])*3 - (r[ 2]+r[ 9]));\
        OP(dst[ 6*dstStride], (r[ 6]+r[ 7])*20 - (r[ 5]+r[ 8])*6 + (r[ 4]+r[ 9])*3 - (r[ 3]+r[10]));\
        OP(dst[ 7*dstStride], (r[ 7]+r[ 8])*20 - (r[ 6]+r[ 9])*6 + (r[ 5]+r[10])*3 - (r[ 4]+r[11]));\
        OP(dst[ 8*dstStride], (r[ 8]+r[ 9])*20 - (r[ 7]+r[10])*6 + (r[ 6]+r[11])*3 - (r[ 5]+r[12]));\
        OP(dst[ 9*dstStride], (r[ 9]+r[10])*20 - (r[ 8]+r[11])*6 + (r[ 7]+r[12])*3 - (r[ 6]+r[13]));\
        OP(dst[10*dstStride], (r[10]+r[11])*20 - (r[ 9]+r[12])*6 + (r[ 8]+r[13])*3 - (r[ 7]+r[14]));\
        OP(dst[11*dstStride], (r[11]+r[12])*20 - (r[10]+r[13])*6 + (r[ 9]+r[14])*3 - (r[ 8]+r[15]));\
        OP(dst[12*dstStride], (r[12]+r[13])*20 - (r[11]+r[14])*6 + (r[10]+r[15])*3 - (r[ 9]+r[16]));\
        OP(dst[13*dstStride], (r[13]+r[14])*20 - (r[12]+r[15])*6 + (r[11]+r[16])*3 - (r[10]+r[16]));\
        OP(dst[14*dstStride], (r[14]+r[15])*20 - (r[13]+r[16])*6 + (r[12]+r[16])*3 - (r[11]+r[15]));\
        OP(dst[15*dstStride], (r[15]+r[16])*20 - (r[14]+r[16])*6 + (r[13]+r[15])*3 - (r[12]+r[14]));\
    }\
}\
1726
\
1727
/* Horizontal low-pass of 8 rows into an 8x8 scratch block, which is then\
 * combined with the unshifted source by pixels8_l2 (defined elsewhere). */\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, lp, stride, stride, 8, 8);\
}\
1732
\
1733
/* Writes the horizontal low-pass of 8 rows of src straight into dst;\
 * no scratch buffer and no second input. */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
1736
\
1737
/* Same as the mc10 wrapper, except that the source handed to pixels8_l2\
 * is shifted one pixel to the right (src+1). */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lp, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, lp, stride, stride, 8, 8);\
}\
1742
\
1743
/* Copies the source into a stride-16 buffer (copy_block9, 9 rows), runs the\
 * vertical low-pass on that copy into an 8x8 scratch block, then combines\
 * the copy (from its first row) with the filter output via pixels8_l2. */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lp[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lp, ref, 8, 16);\
    OPNAME ## pixels8_l2(dst, ref, lp, stride, 16, 8, 8);\
}\
1750
\
1751
/* Copies the source into a stride-16 buffer (9 rows) and writes its\
 * vertical low-pass directly into dst. */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    copy_block9(ref, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, ref, stride, 16);\
}\
1756
\
1757
/* Same as the mc01 wrapper, except that the copy handed to pixels8_l2\
 * starts one row further down (ref+16, i.e. one stride of the copy). */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lp[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lp, ref, 8, 16);\
    OPNAME ## pixels8_l2(dst, ref+16, lp, stride, 16, 8, 8);\
}\
1764
/* Non-static "old" variant: builds horizontal, vertical and\
 * horizontal-then-vertical low-pass planes from a stride-16 copy of the\
 * source and merges the copy plus the three planes with pixels8_l4\
 * (defined elsewhere). lpH holds 9 rows so the vertical pass over it has\
 * all the input rows it reads. */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref, lpH, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
1775
/* Two-stage variant: the 9-row horizontal low-pass is first merged in place\
 * with the source copy (pixels8_l2 into lpH), the merged plane is low-passed\
 * vertically, and finally the merged plane (from row 0) and the vertical\
 * result are combined into dst. */\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
1785
/* Non-static "old" variant: like mc11_old, but the vertical plane and the\
 * plain-copy input to pixels8_l4 are both taken one pixel to the right\
 * (ref+1). */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+1, lpH, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
1796
/* Two-stage variant: like mc11, but the horizontal low-pass is merged with\
 * the source copy shifted one pixel to the right (ref+1). */\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
1806
/* Non-static "old" variant: like mc11_old, but pixels8_l4 takes the copy\
 * and the horizontal plane starting one row down (ref+16, lpH+8). */\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+16, lpH+8, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
1817
/* Two-stage variant: like mc11, but the final pixels8_l2 reads the merged\
 * horizontal plane starting one row down (lpH+8). */\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH+8, lpHV, stride, 8, 8, 8);\
}\
1827
/* Non-static "old" variant: vertical plane taken from ref+1; pixels8_l4\
 * takes the copy one row down and one pixel right (ref+17) and the\
 * horizontal plane one row down (lpH+8). */\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+17, lpH+8, lpV, lpHV, stride, 16, 8, 8, 8, 8);\
}\
1838
/* Two-stage variant: horizontal low-pass merged with the copy shifted one\
 * pixel right (ref+1); the final pixels8_l2 reads the merged plane one row\
 * down (lpH+8). */\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH+8, lpHV, stride, 8, 8, 8);\
}\
1848
/* Horizontal low-pass of 9 source rows (read directly from src), vertical\
 * low-pass of that plane, then pixels8_l2 of the horizontal plane (from\
 * row 0) with the horizontal+vertical plane. */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH, lpHV, stride, 8, 8, 8);\
}\
1855
/* Same as the mc21 wrapper, except that the horizontal plane handed to\
 * pixels8_l2 starts one row down (lpH+8). */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[8*9];\
    uint8_t lpHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpH+8, lpHV, stride, 8, 8, 8);\
}\
1862
/* Non-static "old" variant: computes the vertical plane of the copy and the\
 * vertical plane of the horizontal low-pass, then combines those two with\
 * pixels8_l2. */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpV, lpHV, stride, 8, 8, 8);\
}\
1873
/* Two-stage variant: the 9-row horizontal low-pass is merged in place with\
 * the source copy, and the vertical low-pass of that merged plane is\
 * written directly into dst. */\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
1881
/* Non-static "old" variant: like mc12_old, but the vertical plane of the\
 * copy is taken one pixel to the right (ref+1). */\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    uint8_t lpV[8*8];\
    uint8_t lpHV[8*8];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(lpHV, lpH, 8, 8);\
    OPNAME ## pixels8_l2(dst, lpV, lpHV, stride, 8, 8, 8);\
}\
1892
/* Two-stage variant: like mc12, but the horizontal low-pass is merged with\
 * the source copy shifted one pixel to the right (ref+1). */\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t lpH[8*9];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(lpH, lpH, ref+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
1900
/* Horizontal low-pass of 9 source rows into a scratch plane, followed by a\
 * vertical low-pass of that plane written directly into dst. */\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[8*9];\
    put ## RND ## mpeg4_qpel8_h_lowpass(lpH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, lpH, stride, 8);\
}\
1905
\
1906
/* Horizontal low-pass of 16 rows into a 16x16 scratch block, which is then\
 * combined with the unshifted source by pixels16_l2 (defined elsewhere). */\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lp, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, lp, stride, stride, 16, 16);\
}\
1911
\
1912
/* Writes the horizontal low-pass of 16 rows of src straight into dst;\
 * no scratch buffer and no second input. */\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
1915
\
1916
/* Same as the 16-wide mc10 wrapper, except that the source handed to\
 * pixels16_l2 is shifted one pixel to the right (src+1). */\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lp[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lp, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, lp, stride, stride, 16, 16);\
}\
1921
\
1922
/* Copies the source into a stride-24 buffer (copy_block17, 17 rows), runs\
 * the vertical low-pass on that copy into a 16x16 scratch block, then\
 * combines the copy (from its first row) with the filter output via\
 * pixels16_l2. */\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lp[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lp, ref, 16, 24);\
    OPNAME ## pixels16_l2(dst, ref, lp, stride, 24, 16, 16);\
}\
1929
\
1930
/* Copies the source into a stride-24 buffer (17 rows) and writes its\
 * vertical low-pass directly into dst. */\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    copy_block17(ref, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, ref, stride, 24);\
}\
1935
\
1936
/* Same as the 16-wide mc01 wrapper, except that the copy handed to\
 * pixels16_l2 starts one row further down (ref+24, one stride of the copy). */\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lp[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lp, ref, 16, 24);\
    OPNAME ## pixels16_l2(dst, ref+24, lp, stride, 24, 16, 16);\
}\
1943
/* Non-static "old" variant: builds horizontal, vertical and\
 * horizontal-then-vertical low-pass planes from a stride-24 copy of the\
 * source and merges the copy plus the three planes with pixels16_l4\
 * (defined elsewhere). lpH holds 17 rows so the vertical pass over it has\
 * all the input rows it reads. */\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref, lpH, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
1954
/* Two-stage variant: the 17-row horizontal low-pass is first merged in place\
 * with the source copy (pixels16_l2 into lpH), the merged plane is low-passed\
 * vertically, and finally the merged plane (from row 0) and the vertical\
 * result are combined into dst. */\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
1964
/* Non-static "old" variant: like the 16-wide mc11_old, but the vertical\
 * plane and the plain-copy input to pixels16_l4 are both taken one pixel\
 * to the right (ref+1). */\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+1, lpH, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
1975
/* Two-stage variant: like the 16-wide mc11, but the horizontal low-pass is\
 * merged with the source copy shifted one pixel to the right (ref+1). */\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
1985
/* Non-static "old" variant: like the 16-wide mc11_old, but pixels16_l4\
 * takes the copy and the horizontal plane starting one row down\
 * (ref+24, lpH+16). */\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+24, lpH+16, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
1996
/* Two-stage variant: like the 16-wide mc11, but the final pixels16_l2 reads\
 * the merged horizontal plane starting one row down (lpH+16). */\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH+16, lpHV, stride, 16, 16, 16);\
}\
2006
/* Non-static "old" variant: vertical plane taken from ref+1; pixels16_l4\
 * takes the copy one row down and one pixel right (ref+25) and the\
 * horizontal plane one row down (lpH+16). */\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+25, lpH+16, lpV, lpHV, stride, 24, 16, 16, 16, 16);\
}\
2017
/* Two-stage variant: horizontal low-pass merged with the copy shifted one\
 * pixel right (ref+1); the final pixels16_l2 reads the merged plane one row\
 * down (lpH+16). */\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH+16, lpHV, stride, 16, 16, 16);\
}\
2027
/* Horizontal low-pass of 17 source rows (read directly from src), vertical\
 * low-pass of that plane, then pixels16_l2 of the horizontal plane (from\
 * row 0) with the horizontal+vertical plane. */\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH, lpHV, stride, 16, 16, 16);\
}\
2034
/* Same as the 16-wide mc21 wrapper, except that the horizontal plane handed\
 * to pixels16_l2 starts one row down (lpH+16). */\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[16*17];\
    uint8_t lpHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpH+16, lpHV, stride, 16, 16, 16);\
}\
2041
/* Non-static "old" variant: computes the vertical plane of the copy and the\
 * vertical plane of the horizontal low-pass, then combines those two with\
 * pixels16_l2. */\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpV, lpHV, stride, 16, 16, 16);\
}\
2052
/* Two-stage variant: the 17-row horizontal low-pass is merged in place with\
 * the source copy, and the vertical low-pass of that merged plane is\
 * written directly into dst. */\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}\
2060
/* Non-static "old" variant: like the 16-wide mc12_old, but the vertical\
 * plane of the copy is taken one pixel to the right (ref+1). */\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    uint8_t lpV[16*16];\
    uint8_t lpHV[16*16];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(lpHV, lpH, 16, 16);\
    OPNAME ## pixels16_l2(dst, lpV, lpHV, stride, 16, 16, 16);\
}\
2071
/* Two-stage variant: like the 16-wide mc12, but the horizontal low-pass is\
 * merged with the source copy shifted one pixel to the right (ref+1). */\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t lpH[16*17];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(lpH, lpH, ref+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}\
2079
/* Horizontal low-pass of 17 source rows into a scratch plane, followed by a\
 * vertical low-pass of that plane written directly into dst. This is the\
 * last function of the macro body, so its closing brace carries no\
 * line continuation. */\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t lpH[16*17];\
    put ## RND ## mpeg4_qpel16_h_lowpass(lpH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, lpH, stride, 16);\
}
2084

    
2085
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2086
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2087
#define op_put(a, b) a = cm[((b) + 16)>>5]
2088
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2089

    
2090
QPEL_MC(0, put_       , _       , op_put)
2091
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2092
QPEL_MC(0, avg_       , _       , op_avg)
2093
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
2094
#undef op_avg
2095
#undef op_avg_no_rnd
2096
#undef op_put
2097
#undef op_put_no_rnd
2098

    
2099
/* mc00 entries are plain aliases for the ff_*_pixels routines; none of the
 * QPEL_MC-generated wrappers visible in this part of the file is named mc00.
 * The put_no_rnd_ aliases map to the same routines as the put_ ones. */
#define put_qpel8_mc00_c          ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c   ff_put_pixels8x8_c
#define avg_qpel8_mc00_c          ff_avg_pixels8x8_c
#define put_qpel16_mc00_c         ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c  ff_put_pixels16x16_c
#define avg_qpel16_mc00_c         ff_avg_pixels16x16_c
2105

    
2106
#if 1
2107
#define H264_LOWPASS(OPNAME, OP, OP2) \
2108
/* Horizontal 6-tap low-pass (1, -5, 20, 20, -5, 1) over a 2x2 block.\
 * Output x of a row uses src[x-2] .. src[x+3], so each row reads two\
 * samples to the left of src and three past the last output column.\
 * OP (macro parameter, defined outside this chunk) performs the store;\
 * cm is set up locally for it. */\
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for (y = 0; y < size; y++, dst += dstStride, src += srcStride)\
        for (x = 0; x < size; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
}\
2120
\
2121
/* Vertical 6-tap low-pass (1, -5, 20, 20, -5, 1) over a 2x2 block.\
 * For each column the 7 samples from two rows above src to four rows below\
 * are read first (s[k] is row k-2), then the two output rows are stored\
 * through OP (macro parameter, defined outside this chunk). */\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    for (x = 0; x < size; x++, dst++, src++) {\
        int s[2+5];\
        for (k = 0; k < size+5; k++)\
            s[k] = src[(k-2)*srcStride];\
        for (y = 0; y < size; y++)\
            OP(dst[y*dstStride], (s[y+2]+s[y+3])*20 - (s[y+1]+s[y+4])*5 + (s[y]+s[y+5]));\
    }\
}\
2140
\
2141
/* Separable horizontal-then-vertical 6-tap low-pass over a 2x2 block.\
 * Pass 1 starts two rows above src and writes the unnormalised horizontal\
 * filter sums for size+5 rows into tmp (int16_t). tmp is then rewound so\
 * that it points at the row matching the original src row. Pass 2 applies\
 * the same taps vertically on tmp and stores through OP2 (macro parameter,\
 * defined outside this chunk). */\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int size = 2;\
    const int rows = size + 5;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    src -= 2*srcStride;\
    for (y = 0; y < rows; y++, tmp += tmpStride, src += srcStride)\
        for (x = 0; x < size; x++)\
            tmp[x] = (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
    /* back to the third intermediate row, i.e. the one for the original src */\
    tmp -= tmpStride*(rows-2);\
    for (x = 0; x < size; x++, dst++, tmp++) {\
        int t[2+5];\
        for (k = 0; k < rows; k++)\
            t[k] = tmp[(k-2)*tmpStride];\
        for (y = 0; y < size; y++)\
            OP2(dst[y*dstStride], (t[y+2]+t[y+3])*20 - (t[y+1]+t[y+4])*5 + (t[y]+t[y+5]));\
    }\
}\
2170
/* Horizontal 6-tap low-pass (1, -5, 20, 20, -5, 1) over a 4x4 block.\
 * Output x of a row uses src[x-2] .. src[x+3], so each row reads two\
 * samples to the left of src and three past the last output column.\
 * OP (macro parameter, defined outside this chunk) performs the store;\
 * cm is set up locally for it. */\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for (y = 0; y < size; y++, dst += dstStride, src += srcStride)\
        for (x = 0; x < size; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
}\
2184
\
2185
/* Vertical 6-tap low-pass (1, -5, 20, 20, -5, 1) over a 4x4 block.\
 * For each column the 9 samples from two rows above src to six rows below\
 * are read first (s[k] is row k-2), then the four output rows are stored\
 * through OP (macro parameter, defined outside this chunk). */\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    for (x = 0; x < size; x++, dst++, src++) {\
        int s[4+5];\
        for (k = 0; k < size+5; k++)\
            s[k] = src[(k-2)*srcStride];\
        for (y = 0; y < size; y++)\
            OP(dst[y*dstStride], (s[y+2]+s[y+3])*20 - (s[y+1]+s[y+4])*5 + (s[y]+s[y+5]));\
    }\
}\
2208
\
2209
/* Separable horizontal-then-vertical 6-tap low-pass over a 4x4 block.\
 * Pass 1 starts two rows above src and writes the unnormalised horizontal\
 * filter sums for size+5 rows into tmp (int16_t). tmp is then rewound so\
 * that it points at the row matching the original src row. Pass 2 applies\
 * the same taps vertically on tmp and stores through OP2 (macro parameter,\
 * defined outside this chunk). */\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int size = 4;\
    const int rows = size + 5;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    src -= 2*srcStride;\
    for (y = 0; y < rows; y++, tmp += tmpStride, src += srcStride)\
        for (x = 0; x < size; x++)\
            tmp[x] = (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
    /* back to the third intermediate row, i.e. the one for the original src */\
    tmp -= tmpStride*(rows-2);\
    for (x = 0; x < size; x++, dst++, tmp++) {\
        int t[4+5];\
        for (k = 0; k < rows; k++)\
            t[k] = tmp[(k-2)*tmpStride];\
        for (y = 0; y < size; y++)\
            OP2(dst[y*dstStride], (t[y+2]+t[y+3])*20 - (t[y+1]+t[y+4])*5 + (t[y]+t[y+5]));\
    }\
}\
2244
\
2245
/* Horizontal 6-tap low-pass (1, -5, 20, 20, -5, 1) over an 8x8 block.\
 * Output x of a row uses src[x-2] .. src[x+3], so each row reads two\
 * samples to the left of src and three past the last output column.\
 * OP (macro parameter, defined outside this chunk) performs the store;\
 * cm is set up locally for it. */\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for (y = 0; y < size; y++, dst += dstStride, src += srcStride)\
        for (x = 0; x < size; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
}\
2263
\
2264
/* Vertical 6-tap low-pass (1, -5, 20, 20, -5, 1) over an 8x8 block.\
 * For each column the 13 samples from two rows above src to ten rows below\
 * are read first (s[k] is row k-2), then the eight output rows are stored\
 * through OP (macro parameter, defined outside this chunk). */\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int size = 8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y, k;\
    for (x = 0; x < size; x++, dst++, src++) {\
        int s[8+5];\
        for (k = 0; k < size+5; k++)\
            s[k] = src[(k-2)*srcStride];\
        for (y = 0; y < size; y++)\
            OP(dst[y*dstStride], (s[y+2]+s[y+3])*20 - (s[y+1]+s[y+4])*5 + (s[y]+s[y+5]));\
    }\
}\
2295
\
2296
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2297
    const int h=8;\
2298
    const int w=8;\
2299
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300
    int i;\
2301
    src -= 2*srcStride;\
2302
    for(i=0; i<h+5; i++)\
2303
    {\
2304
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2305
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2306
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2307
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2308
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2309
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2310
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2311
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2312
        tmp+=tmpStride;\
2313
        src+=srcStride;\
2314
    }\
2315
    tmp -= tmpStride*(h+5-2);\
2316
    for(i=0; i<w; i++)\
2317
    {\
2318
        const int tmpB= tmp[-2*tmpStride];\
2319
        const int tmpA= tmp[-1*tmpStride];\
2320
        const int tmp0= tmp[0 *tmpStride];\
2321
        const int tmp1= tmp[1 *tmpStride];\
2322
        const int tmp2= tmp[2 *tmpStride];\
2323
        const int tmp3= tmp[3 *tmpStride];\
2324
        const int tmp4= tmp[4 *tmpStride];\
2325
        const int tmp5= tmp[5 *tmpStride];\
2326
        const int tmp6= tmp[6 *tmpStride];\
2327
        const int tmp7= tmp[7 *tmpStride];\
2328
        const int tmp8= tmp[8 *tmpStride];\
2329
        const int tmp9= tmp[9 *tmpStride];\
2330
        const int tmp10=tmp[10*tmpStride];\
2331
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2332
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2333
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2334
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2335
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2336
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2337
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2338
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2339
        dst++;\
2340
        tmp++;\
2341
    }\
2342
}\
2343
\
2344
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2345
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2346
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2347
    src += 8*srcStride;\
2348
    dst += 8*dstStride;\
2349
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2350
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2351
}\
2352
\
2353
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2354
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2355
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2356
    src += 8*srcStride;\
2357
    dst += 8*dstStride;\
2358
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2359
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2360
}\
2361
\
2362
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2363
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2364
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2365
    src += 8*srcStride;\
2366
    dst += 8*dstStride;\
2367
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2368
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2369
}\
2370

    
2371
/*
 * H264_MC(OPNAME, SIZE) expands to the sixteen
 * OPNAME##h264_qpel##SIZE##_mcXY_c functions for one block size.
 *
 * Every variant builds its intermediate planes with the put_ lowpass filters
 * into SIZE-stride stack buffers and lets the OPNAME-specific helper
 * (pixels##SIZE##_c, pixels##SIZE##_l2 or a direct lowpass call) perform the
 * final store into dst.  Variants that filter vertically first copy SIZE+5
 * source rows (starting two rows above src) into a packed buffer so that the
 * vertical filter can run with a stride of SIZE.
 */
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* no filtering: hand the block straight to the pixel helper */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t h_half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src, SIZE, stride);\
    /* combine the unfiltered source with the horizontally filtered plane */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, h_half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    /* horizontal lowpass written directly to dst */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t h_half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src, SIZE, stride);\
    /* same as mc10 but paired with the source shifted one sample right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, h_half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t v_half[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    /* vertical lowpass written directly to dst */\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t v_half[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    /* same as mc01 but paired with the copied source one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t v_half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t v_half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src, SIZE, stride);\
    /* vertical plane is taken one column to the right of src */\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t v_half[SIZE*SIZE];\
    /* horizontal plane is taken one row below src */\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t v_half[SIZE*SIZE];\
    /* horizontal plane one row below src, vertical plane one column right */\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, v_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    /* two-pass (h then v) lowpass written directly to dst */\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t hv_half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_half, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, hv_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t h_half[SIZE*SIZE];\
    uint8_t hv_half[SIZE*SIZE];\
    /* horizontal plane is taken one row below src */\
    put_h264_qpel ## SIZE ## _h_lowpass(h_half, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_half, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, h_half, hv_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t v_half[SIZE*SIZE];\
    uint8_t hv_half[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_half, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, v_half, hv_half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid= rows + 2*SIZE;\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t v_half[SIZE*SIZE];\
    uint8_t hv_half[SIZE*SIZE];\
    /* vertical plane is taken one column to the right of src */\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(v_half, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hv_half, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, v_half, hv_half, stride, SIZE, SIZE, SIZE);\
}
2507

    
2508
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2509
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2510
#define op_put(a, b)  a = cm[((b) + 16)>>5]
2511
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2512
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2513

    
2514
H264_LOWPASS(put_       , op_put, op2_put)
2515
H264_LOWPASS(avg_       , op_avg, op2_avg)
2516
H264_MC(put_, 2)
2517
H264_MC(put_, 4)
2518
H264_MC(put_, 8)
2519
H264_MC(put_, 16)
2520
H264_MC(avg_, 4)
2521
H264_MC(avg_, 8)
2522
H264_MC(avg_, 16)
2523

    
2524
#undef op_avg
2525
#undef op_put
2526
#undef op2_avg
2527
#undef op2_put
2528
#endif
2529

    
2530
/*
 * From this point on, the 8x8 and 16x16 mc00 names resolve to the
 * fixed-size pixel functions defined further down in this file.
 */
#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2534

    
2535
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2536
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2537
    int i;
2538

    
2539
    for(i=0; i<h; i++){
2540
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2541
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2542
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2543
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2544
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2545
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2546
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2547
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2548
        dst+=dstStride;
2549
        src+=srcStride;
2550
    }
2551
}
2552

    
2553
/**
 * Fixed-height entry point: forwards to put_pixels8_c() with 8 rows.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_ROWS = 8 };

    put_pixels8_c(dst, src, stride, BLOCK_ROWS);
}
2556
/**
 * Fixed-height entry point: forwards to avg_pixels8_c() with 8 rows.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_ROWS = 8 };

    avg_pixels8_c(dst, src, stride, BLOCK_ROWS);
}
2559
/**
 * Fixed-height entry point: forwards to put_pixels16_c() with 16 rows.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_ROWS = 16 };

    put_pixels16_c(dst, src, stride, BLOCK_ROWS);
}
2562
/**
 * Fixed-height entry point: forwards to avg_pixels16_c() with 16 rows.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_ROWS = 16 };

    avg_pixels16_c(dst, src, stride, BLOCK_ROWS);
}
2565

    
2566
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points.  Each one simply forwards to the matching
 * {put,avg}_pixels{16,8}_xy2_c() routine with a row count equal to the
 * block width.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 16;
    put_pixels16_xy2_c(dst, src, stride, rows);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 16;
    avg_pixels16_xy2_c(dst, src, stride, rows);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;
    put_pixels8_xy2_c(dst, src, stride, rows);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;
    avg_pixels8_xy2_c(dst, src, stride, rows);
}
#endif /* CONFIG_RV40_DECODER */
2580

    
2581
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2582
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2583
    int i;
2584

    
2585
    for(i=0; i<w; i++){
2586
        const int src_1= src[ -srcStride];
2587
        const int src0 = src[0          ];
2588
        const int src1 = src[  srcStride];
2589
        const int src2 = src[2*srcStride];
2590
        const int src3 = src[3*srcStride];
2591
        const int src4 = src[4*srcStride];
2592
        const int src5 = src[5*srcStride];
2593
        const int src6 = src[6*srcStride];
2594
        const int src7 = src[7*srcStride];
2595
        const int src8 = src[8*srcStride];
2596
        const int src9 = src[9*srcStride];
2597
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2598
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2599
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2600
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2601
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2602
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2603
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2604
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2605
        src++;
2606
        dst++;
2607
    }
2608
}
2609

    
2610
/**
 * Filters the 8x8 block horizontally into a packed scratch block, then lets
 * put_pixels8_l2() combine that block with the unfiltered source into dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_blk[8*8];

    wmv2_mspel8_h_lowpass(h_blk, src, 8, stride, 8);
    put_pixels8_l2(dst, src, h_blk, stride, stride, 8, 8);
}
2615

    
2616
/**
 * Horizontally filters 8 rows of src straight into dst; both use stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2619

    
2620
/**
 * Like put_mspel8_mc10_c(), except that the horizontally filtered block is
 * combined with the source shifted one sample to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_blk[8*8];

    wmv2_mspel8_h_lowpass(h_blk, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, h_blk, stride, stride, 8, 8);
}
2625

    
2626
/**
 * Vertically filters 8 columns of src straight into dst; both use stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int cols = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2629

    
2630
/**
 * Builds two 8x8 planes and lets put_pixels8_l2() combine them into dst:
 *  - v_blk:  src filtered vertically,
 *  - hv_blk: src filtered horizontally (11 rows, starting one row above src)
 *            and then vertically.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];
    uint8_t v_blk[8*8];
    uint8_t hv_blk[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_blk, src, 8, stride, 8);
    /* h_rows+8 skips the extra top row, so row -1 stays readable */
    wmv2_mspel8_v_lowpass(hv_blk, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_blk, hv_blk, stride, 8, 8, 8);
}
2639
/**
 * Like put_mspel8_mc12_c(), except that the purely vertical plane is taken
 * from the source shifted one sample to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];
    uint8_t v_blk[8*8];
    uint8_t hv_blk[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_blk, src+1, 8, stride, 8);
    /* h_rows+8 skips the extra top row, so row -1 stays readable */
    wmv2_mspel8_v_lowpass(hv_blk, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_blk, hv_blk, stride, 8, 8, 8);
}
2648
/**
 * Horizontal filter over 11 rows (starting one row above src) into a packed
 * scratch buffer, followed by a vertical filter of that buffer into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[11*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    /* h_rows+8 skips the extra top row, so row -1 stays readable */
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
2653

    
2654
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2656
    int x;
2657
    const int strength= ff_h263_loop_filter_strength[qscale];
2658

    
2659
    for(x=0; x<8; x++){
2660
        int d1, d2, ad1;
2661
        int p0= src[x-2*stride];
2662
        int p1= src[x-1*stride];
2663
        int p2= src[x+0*stride];
2664
        int p3= src[x+1*stride];
2665
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2666

    
2667
        if     (d<-2*strength) d1= 0;
2668
        else if(d<-  strength) d1=-2*strength - d;
2669
        else if(d<   strength) d1= d;
2670
        else if(d< 2*strength) d1= 2*strength - d;
2671
        else                   d1= 0;
2672

    
2673
        p1 += d1;
2674
        p2 -= d1;
2675
        if(p1&256) p1= ~(p1>>31);
2676
        if(p2&256) p2= ~(p2>>31);
2677

    
2678
        src[x-1*stride] = p1;
2679
        src[x+0*stride] = p2;
2680

    
2681
        ad1= FFABS(d1)>>1;
2682

    
2683
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2684

    
2685
        src[x-2*stride] = p0 - d2;
2686
        src[x+  stride] = p3 + d2;
2687
    }
2688
    }
2689
}
2690

    
2691
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2692
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2693
    int y;
2694
    const int strength= ff_h263_loop_filter_strength[qscale];
2695

    
2696
    for(y=0; y<8; y++){
2697
        int d1, d2, ad1;
2698
        int p0= src[y*stride-2];
2699
        int p1= src[y*stride-1];
2700
        int p2= src[y*stride+0];
2701
        int p3= src[y*stride+1];
2702
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703

    
2704
        if     (d<-2*strength) d1= 0;
2705
        else if(d<-  strength) d1=-2*strength - d;
2706
        else if(d<   strength) d1= d;
2707
        else if(d< 2*strength) d1= 2*strength - d;
2708
        else                   d1= 0;
2709

    
2710
        p1 += d1;
2711
        p2 -= d1;
2712
        if(p1&256) p1= ~(p1>>31);
2713
        if(p2&256) p2= ~(p2>>31);
2714

    
2715
        src[y*stride-1] = p1;
2716
        src[y*stride+0] = p2;
2717

    
2718
        ad1= FFABS(d1)>>1;
2719

    
2720
        d2= av_clip((p0-p3)/4, -ad1, ad1);
2721

    
2722
        src[y*stride-2] = p0 - d2;
2723
        src[y*stride+1] = p3 + d2;
2724
    }
2725
    }
2726
}
2727

    
2728
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array scaled by 4; the top and
 * bottom rows are not filtered, only scaled.  Pass 2 filters horizontally
 * and rounds back to 8 bits; the left and right columns are not filtered
 * horizontally, they only drop the scale of pass 1.
 *
 * @param src    top-left sample of the block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* pass 1: vertical */
    for(col=0; col<8; col++){
        vert[col]      = 4*src[col];
        vert[56 + col] = 4*src[7*stride + col];
    }
    for(row=1; row<7; row++){
        const uint8_t *line = src + row*stride;
        int *out = vert + row*8;

        for(col=0; col<8; col++)
            out[col] = line[col - stride] + 2*line[col] + line[col + stride];
    }

    /* pass 2: horizontal, reading only from the scratch array */
    for(row=0; row<8; row++){
        uint8_t *line = src + row*stride;
        const int *in = vert + row*8;

        line[0] = (in[0] + 2)>>2;
        line[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2754

    
2755
/**
 * Sum of absolute differences between two 16-sample-wide blocks.
 *
 * @param v         not used
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return sum of |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2782

    
2783
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * reference sample with its right-hand neighbour.
 *
 * Reads pix2[0] .. pix2[16] on every row.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2810

    
2811
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * reference sample with the sample directly below it.
 *
 * Reads h+1 rows of the reference.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2840

    
2841
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of each
 * 2x2 neighbourhood of the reference (sample, right, below, below-right).
 *
 * Reads 17 samples per row over h+1 rows of the reference.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2870

    
2871
/**
 * Sum of absolute differences between two 8-sample-wide blocks.
 *
 * @param v         not used
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return sum of |pix1 - pix2| over 8 x h samples
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2890

    
2891
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * reference sample with its right-hand neighbour.
 *
 * Reads pix2[0] .. pix2[8] on every row.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2910

    
2911
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * reference sample with the sample directly below it.
 *
 * Reads h+1 rows of the reference.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2932

    
2933
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of each
 * 2x2 neighbourhood of the reference (sample, right, below, below-right).
 *
 * Reads 9 samples per row over h+1 rows of the reference.
 *
 * @param v         not used
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2954

    
2955
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2956
    MpegEncContext *c = v;
2957
    int score1=0;
2958
    int score2=0;
2959
    int x,y;
2960

    
2961
    for(y=0; y<h; y++){
2962
        for(x=0; x<16; x++){
2963
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2964
        }
2965
        if(y+1<h){
2966
            for(x=0; x<15; x++){
2967
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
2968
                             - s1[x+1] + s1[x+1+stride])
2969
                        -FFABS(  s2[x  ] - s2[x  +stride]
2970
                             - s2[x+1] + s2[x+1+stride]);
2971
            }
2972
        }
2973
        s1+= stride;
2974
        s2+= stride;
2975
    }
2976

    
2977
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2978
    else  return score1 + FFABS(score2)*8;
2979
}
2980

    
2981
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2982
    MpegEncContext *c = v;
2983
    int score1=0;
2984
    int score2=0;
2985
    int x,y;
2986

    
2987
    for(y=0; y<h; y++){
2988
        for(x=0; x<8; x++){
2989
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2990
        }
2991
        if(y+1<h){
2992
            for(x=0; x<7; x++){
2993
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
2994
                             - s1[x+1] + s1[x+1+stride])
2995
                        -FFABS(  s2[x  ] - s2[x  +stride]
2996
                             - s2[x+1] + s2[x+1+stride]);
2997
            }
2998
        }
2999
        s1+= stride;
3000
        s2+= stride;
3001
    }
3002

    
3003
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3004
    else  return score1 + FFABS(score2)*8;
3005
}
3006

    
3007
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3008
    int i;
3009
    unsigned int sum=0;
3010

    
3011
    for(i=0; i<8*8; i++){
3012
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3013
        int w= weight[i];
3014
        b>>= RECON_SHIFT;
3015
        assert(-512<b && b<512);
3016

    
3017
        sum += (w*b)*(w*b)>>4;
3018
    }
3019
    return sum>>2;
3020
}
3021

    
3022
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3023
    int i;
3024

    
3025
    for(i=0; i<8*8; i++){
3026
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3027
    }
3028
}
3029

    
3030
/**
3031
 * permutes an 8x8 block.
3032
 * @param block the block which will be permuted according to the given permutation vector
3033
 * @param permutation the permutation vector
3034
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3035
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3036
 *                  (inverse) permutated to scantable order!
3037
 */
3038
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3039
{
3040
    int i;
3041
    DCTELEM temp[64];
3042

    
3043
    if(last<=0) return;
3044
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3045

    
3046
    for(i=0; i<=last; i++){
3047
        const int j= scantable[i];
3048
        temp[j]= block[j];
3049
        block[j]=0;
3050
    }
3051

    
3052
    for(i=0; i<=last; i++){
3053
        const int j= scantable[i];
3054
        const int perm_j= permutation[j];
3055
        block[perm_j]= temp[j];
3056
    }
3057
}
3058

    
3059
/**
 * Comparison function that ignores all of its arguments.
 * ff_set_cmp() installs it for FF_CMP_ZERO.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3062

    
3063
/**
 * Fill the six entries of cmp[] with the comparison functions selected by
 * the low 8 bits of type.
 * cmp[] is zeroed first; for an unknown type an error is logged for each of
 * the six entries and cmp[] stays zeroed.
 * @param c    context holding the per-metric function tables
 * @param cmp  destination table, 6 entries
 * @param type FF_CMP_* selector (only type&0xFF is examined)
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    const int kind = type & 0xFF;
    int idx;

    memset(cmp, 0, sizeof(void*)*6);

    for (idx = 0; idx < 6; idx++) {
        switch (kind) {
        case FF_CMP_SAD:    cmp[idx] = c->sad[idx];            break;
        case FF_CMP_SATD:   cmp[idx] = c->hadamard8_diff[idx]; break;
        case FF_CMP_SSE:    cmp[idx] = c->sse[idx];            break;
        case FF_CMP_DCT:    cmp[idx] = c->dct_sad[idx];        break;
        case FF_CMP_DCT264: cmp[idx] = c->dct264_sad[idx];     break;
        case FF_CMP_DCTMAX: cmp[idx] = c->dct_max[idx];        break;
        case FF_CMP_PSNR:   cmp[idx] = c->quant_psnr[idx];     break;
        case FF_CMP_BIT:    cmp[idx] = c->bit[idx];            break;
        case FF_CMP_RD:     cmp[idx] = c->rd[idx];             break;
        case FF_CMP_VSAD:   cmp[idx] = c->vsad[idx];           break;
        case FF_CMP_VSSE:   cmp[idx] = c->vsse[idx];           break;
        case FF_CMP_ZERO:   cmp[idx] = zero_cmp;               break;
        case FF_CMP_NSSE:   cmp[idx] = c->nsse[idx];           break;
#if CONFIG_DWT
        case FF_CMP_W53:    cmp[idx] = c->w53[idx];            break;
        case FF_CMP_W97:    cmp[idx] = c->w97[idx];            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3122

    
3123
/**
 * Zero one block of 64 coefficients.
 */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
3127

    
3128
/**
3129
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3130
 */
3131
static void clear_blocks_c(DCTELEM *blocks)
3132
{
3133
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
3134
}
3135

    
3136
/**
 * Add src[] to dst[] byte by byte (each byte modulo 256), in place.
 * Whole longs are processed with a carry-free packed addition; the
 * remaining bytes are handled one at a time.
 * @param dst destination, also the first addend
 * @param src second addend
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;

    /* sizeof() is unsigned: without the cast, "w - sizeof(long)" wraps to a
     * huge value whenever w < sizeof(long) and the word loop would run past
     * the end of both buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        /* add the low 7 bits of every byte, then patch in the top bits
         * with XOR so no carry crosses a byte boundary */
        *(long*)(dst+i) = ((a&low7) + (b&low7)) ^ ((a^b)&top1);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
3146

    
3147
/**
 * Store the byte-wise sum (each byte modulo 256) of src1[] and src2[] in dst[].
 * Whole longs are processed with a carry-free packed addition; the
 * remaining bytes are handled one at a time.
 * @param dst  destination
 * @param src1 first addend
 * @param src2 second addend
 * @param w    number of bytes to process
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;

    /* sizeof() is unsigned: without the cast, "w - sizeof(long)" wraps to a
     * huge value whenever w < sizeof(long) and the word loop would run past
     * the end of the buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        /* add the low 7 bits of every byte, then patch in the top bits
         * with XOR so no carry crosses a byte boundary */
        *(long*)(dst+i) = ((a&low7) + (b&low7)) ^ ((a^b)&top1);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3157

    
3158
/**
 * Store the byte-wise difference (each byte modulo 256) src1[] - src2[] in dst[].
 * When HAVE_FAST_UNALIGNED is not set and src2 is not long-aligned, the bulk
 * is done 8 bytes at a time with plain byte arithmetic; otherwise whole longs
 * are processed with a borrow-free packed subtraction. The remaining bytes
 * are handled one at a time.
 * @param dst  destination
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 replicated into every byte of a long (same values as pb_7f / pb_80) */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* sizeof() is unsigned: without the cast, "w - sizeof(long)" wraps to a
     * huge value whenever w < sizeof(long) and the word loop would run past
     * the end of the buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        /* force the top bit of every byte of a so that subtracting the low
         * 7 bits of b never borrows across bytes, then fix the top bits with XOR */
        *(long*)(dst+i) = ((a|top1) - (b&low7)) ^ ((a^b^top1)&top1);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
3182

    
3183
/**
 * Reconstruct a row from residuals using median prediction.
 * For every position the predictor is mid_pred() of the previous output
 * byte, src1[i] and (previous + src1[i] - previous src1 byte) & 0xFF;
 * diff[i] is added to it (modulo 256) and the result is written to dst[i].
 * @param left     in: initial "previous output" byte, out: last byte written
 * @param left_top in: initial "previous src1" byte, out: last src1 byte read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - diag) & 0xFF;

        cur    = mid_pred(cur, top, gradient) + diff[x];
        diag   = top;
        dst[x] = cur;
    }

    *left     = cur;
    *left_top = diag;
}
3199

    
3200
/**
 * Compute residuals of src2[] against a median predictor.
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * src1[i] and (previous src2 + src1[i] - previous src1 byte) & 0xFF;
 * dst[i] receives src2[i] minus that predictor (modulo 256).
 * @param left     in: initial "previous src2" byte, out: last src2 byte read
 * @param left_top in: initial "previous src1" byte, out: last src1 byte read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur  = *left;
    uint8_t diag = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - diag) & 0xFF;
        const int pred     = mid_pred(cur, top, gradient);

        diag   = top;
        cur    = src2[x];
        dst[x] = cur - pred;
    }

    *left     = cur;
    *left_top = diag;
}
3217

    
3218
/**
 * Running sum ("left prediction"): dst[i] receives the low 8 bits of
 * acc + src[0] + ... + src[i].
 * @param acc initial accumulator value
 * @return the final accumulator (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for (x = 0; x < w; x++) {
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3236

    
3237
/* byte offset of each component inside a 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-component running sum over w 4-byte pixels: every output component is
 * the low 8 bits of its accumulator after adding the matching input component.
 * @param red,green,blue,alpha in: initial accumulators, out: final accumulators
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4*px;
        uint8_t       *out = dst + 4*px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
3277

    
3278
/* o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two separate statements (not wrapped in a block), so it must
 * not be used as the unbraced body of an if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y.
 * Uses block-local temporaries named a and b, so the arguments must not be
 * expressions involving variables with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: the sum of absolute values of a butterfly's two outputs,
 * without storing them. Each argument is evaluated more than once. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3292

    
3293
/**
 * Sum of absolute values of the butterfly-transformed 8x8 difference
 * src - dst (installed by ff_set_cmp() for FF_CMP_SATD).
 * Each row goes through three butterfly stages, then each column does the
 * same; the last column stage is folded into the absolute-value sum.
 * @param s      unused
 * @param stride distance in bytes between two rows of dst and of src
 * @param h      must be 8 (asserted)
 * @return the accumulated sum
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: transform row i of the difference into temp[8*i .. 8*i+7] */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on column i; the third stage is computed by BUTTERFLYA
     * and summed directly instead of being stored */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    /* disabled debugging aid: prints the largest sum seen so far */
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3344

    
3345
/**
 * Same butterfly transform and absolute-value sum as hadamard8_diff8x8_c,
 * applied to the pixels of src themselves instead of a difference, with the
 * absolute value of the first transform coefficient subtracted at the end.
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between two rows of src
 * @param h      must be 8 (asserted)
 * @return the accumulated sum
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: transform row i of src into temp[8*i .. 8*i+7] */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on column i; the third stage is computed by BUTTERFLYA
     * and summed directly instead of being stored */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] is the first output of column 0's last stage;
     * remove its contribution from the sum */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3392

    
3393
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3394
    MpegEncContext * const s= (MpegEncContext *)c;
3395
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3396

    
3397
    assert(h==8);
3398

    
3399
    s->dsp.diff_pixels(temp, src1, src2, stride);
3400
    s->dsp.fdct(temp);
3401
    return s->dsp.sum_abs_dctelem(temp);
3402
}
3403

    
3404
#if CONFIG_GPL
3405
#define DCT8_1D {\
3406
    const int s07 = SRC(0) + SRC(7);\
3407
    const int s16 = SRC(1) + SRC(6);\
3408
    const int s25 = SRC(2) + SRC(5);\
3409
    const int s34 = SRC(3) + SRC(4);\
3410
    const int a0 = s07 + s34;\
3411
    const int a1 = s16 + s25;\
3412
    const int a2 = s07 - s34;\
3413
    const int a3 = s16 - s25;\
3414
    const int d07 = SRC(0) - SRC(7);\
3415
    const int d16 = SRC(1) - SRC(6);\
3416
    const int d25 = SRC(2) - SRC(5);\
3417
    const int d34 = SRC(3) - SRC(4);\
3418
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
3419
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
3420
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
3421
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
3422
    DST(0,  a0 + a1     ) ;\
3423
    DST(1,  a4 + (a7>>2)) ;\
3424
    DST(2,  a2 + (a3>>1)) ;\
3425
    DST(3,  a5 + (a6>>2)) ;\
3426
    DST(4,  a0 - a1     ) ;\
3427
    DST(5,  a6 - (a5>>2)) ;\
3428
    DST(6, (a2>>1) - a3 ) ;\
3429
    DST(7, (a4>>2) - a7 ) ;\
3430
}
3431

    
3432
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3433
    MpegEncContext * const s= (MpegEncContext *)c;
3434
    DCTELEM dct[8][8];
3435
    int i;
3436
    int sum=0;
3437

    
3438
    s->dsp.diff_pixels(dct[0], src1, src2, stride);
3439

    
3440
#define SRC(x) dct[i][x]
3441
#define DST(x,v) dct[i][x]= v
3442
    for( i = 0; i < 8; i++ )
3443
        DCT8_1D
3444
#undef SRC
3445
#undef DST
3446

    
3447
#define SRC(x) dct[x][i]
3448
#define DST(x,v) sum += FFABS(v)
3449
    for( i = 0; i < 8; i++ )
3450
        DCT8_1D
3451
#undef SRC
3452
#undef DST
3453
    return sum;
3454
}
3455
#endif
3456

    
3457
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3458
    MpegEncContext * const s= (MpegEncContext *)c;
3459
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3460
    int sum=0, i;
3461

    
3462
    assert(h==8);
3463

    
3464
    s->dsp.diff_pixels(temp, src1, src2, stride);
3465
    s->dsp.fdct(temp);
3466

    
3467
    for(i=0; i<64; i++)
3468
        sum= FFMAX(sum, FFABS(temp[i]));
3469

    
3470
    return sum;
3471
}
3472

    
3473
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474
    MpegEncContext * const s= (MpegEncContext *)c;
3475
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3476
    DCTELEM * const bak = temp+64;
3477
    int sum=0, i;
3478

    
3479
    assert(h==8);
3480
    s->mb_intra=0;
3481

    
3482
    s->dsp.diff_pixels(temp, src1, src2, stride);
3483

    
3484
    memcpy(bak, temp, 64*sizeof(DCTELEM));
3485

    
3486
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3487
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
3488
    ff_simple_idct(temp); //FIXME
3489

    
3490
    for(i=0; i<64; i++)
3491
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3492

    
3493
    return sum;
3494
}
3495

    
3496
/**
 * Combined distortion + bit-cost score for coding the difference of two 8x8 blocks.
 * Both blocks are copied into local stride-8 buffers, their difference is
 * quantized, the coded size is summed from the context's VLC length tables,
 * the coefficients are dequantized and added back onto the copy of src2 with
 * idct_add(), and the squared error against the copy of src1 is measured.
 * Side effect on the context: block_last_index[0] is overwritten.
 * @param c an MpegEncContext
 * @param h must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on private copies so that idct_add() below does not modify the caller's pixels */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i only receives the quantizer's last output argument; it is reused as a loop index below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra: the first coefficient is costed via the DC table and the scan starts at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every nonzero coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                /* biased level in 0..127: table lookup, otherwise escape length */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3571

    
3572
/**
 * Bit-cost score for coding the difference of two 8x8 blocks.
 * The difference is quantized with fast_dct_quantize() and the coded size is
 * summed from the context's VLC length tables.
 * Side effect on the context: block_last_index[0] is overwritten.
 * @param c an MpegEncContext
 * @param h must be 8 (asserted)
 * @return the accumulated number of bits
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's last output argument; it is reused as a loop index below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra: the first coefficient is costed via the DC table and the scan starts at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every nonzero coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                /* biased level in 0..127: table lookup, otherwise escape length */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3630

    
3631
/* Generates vsad_intra<size>_c(): the sum, over rows 1..h-1 and the first
 * <size> columns, of the absolute difference between each pixel and the one
 * directly below it. Columns are processed four at a time. The c and dummy
 * arguments are unused. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                       \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++) {                                 \
        const uint8_t *lower = upper + stride;                      \
        for (col = 0; col < size; col += 4) {                       \
            total += FFABS(upper[col  ] - lower[col  ]);            \
            total += FFABS(upper[col+1] - lower[col+1]);            \
            total += FFABS(upper[col+2] - lower[col+2]);            \
            total += FFABS(upper[col+3] - lower[col+3]);            \
        }                                                           \
        upper = lower;                                              \
    }                                                               \
                                                                    \
    return total;                                                   \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3648

    
3649
/**
 * Vertical SAD of the difference signal s1 - s2 over 16 columns:
 * for rows 1..h-1, sums |(s1 - s2) at a pixel minus (s1 - s2) one row below|.
 * @param c unused
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            total += FFABS(here - below);
        }
    }

    return total;
}
3663

    
3664
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c(): the sum, over rows 1..h-1 and the first
 * <size> columns, of the squared difference between each pixel and the one
 * directly below it. Columns are processed four at a time. The c and dummy
 * arguments are unused. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                       \
    int total = 0;                                                  \
    int row, col;                                                   \
                                                                    \
    for (row = 1; row < h; row++) {                                 \
        const uint8_t *lower = upper + stride;                      \
        for (col = 0; col < size; col += 4) {                       \
            total += SQ(upper[col  ] - lower[col  ]);               \
            total += SQ(upper[col+1] - lower[col+1]);               \
            total += SQ(upper[col+2] - lower[col+2]);               \
            total += SQ(upper[col+3] - lower[col+3]);               \
        }                                                           \
        upper = lower;                                              \
    }                                                               \
                                                                    \
    return total;                                                   \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
3682

    
3683
/**
 * Vertical SSE of the difference signal s1 - s2 over 16 columns:
 * for rows 1..h-1, sums the square of ((s1 - s2) at a pixel minus
 * (s1 - s2) one row below).
 * @param c unused
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int here  = s1[col]          - s2[col];
            const int below = s1[col + stride] - s2[col + stride];
            const int delta = here - below;
            total += delta * delta;
        }
    }

    return total;
}
3697

    
3698
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];
        total += delta * delta;
    }
    return total;
}
3706

    
3707
/* WRAPPER8_16_SQ is defined outside this part of the file. Presumably each
 * line below generates the second-named 16-pixel function on top of the
 * first-named 8x8 function -- TODO confirm against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists when CONFIG_GPL is set */
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3717

    
3718
/**
 * Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k];
        k++;
    }
}
3723

    
3724
/**
 * Element-wise product with the second operand read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int last = len - 1;
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[last - k];
}
3730

    
3731
/**
 * Multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for i in [0, len).
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
3736

    
3737
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k]
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m]
 * win[] is read over 2*len entries.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int lo, hi;

    /* lo walks up from the start, hi walks down from the end of dst/win */
    for (lo = 0, hi = 2*len - 1; lo < len; lo++, hi--) {
        const float s0   = src0[lo];
        const float s1   = src1[hi - len];
        const float w_lo = win[lo];
        const float w_hi = win[hi];

        dst[lo] = s0*w_hi - s1*w_lo;
        dst[hi] = s0*w_lo + s1*w_hi;
    }
}
3753

    
3754
/**
 * Scale a vector: dst[i] = src[i] * mul for i in [0, len).
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3761

    
3762
/**
 * Multiply src by a sequence of 2-element vectors and by a scalar.
 * Each entry of sv points to 2 floats covering the next 2 elements:
 * dst[i+j] = src[i+j] * sv[i/2][j] * mul. Elements are processed in
 * pairs, i advancing by 2 while i < len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base  ] = src[base  ] * vec[0] * mul;
        dst[base+1] = src[base+1] * vec[1] * mul;
    }
}
3771

    
3772
/**
 * Multiply src by a sequence of 4-element vectors and by a scalar.
 * Each entry of sv points to 4 floats covering the next 4 elements:
 * dst[i+j] = src[i+j] * sv[i/4][j] * mul. Elements are processed in
 * groups of 4, i advancing by 4 while i < len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        dst[base  ] = src[base  ] * vec[0] * mul;
        dst[base+1] = src[base+1] * vec[1] * mul;
        dst[base+2] = src[base+2] * vec[2] * mul;
        dst[base+3] = src[base+3] * vec[3] * mul;
    }
}
3783

    
3784
/**
 * Scale a sequence of 2-element vectors into a flat array.
 * Each entry of sv points to 2 floats: dst[i+j] = sv[i/2][j] * mul.
 * Elements are processed in pairs, i advancing by 2 while i < len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *vec = *sv++;

        dst[base  ] = vec[0] * mul;
        dst[base+1] = vec[1] * mul;
    }
}
3793

    
3794
/**
 * Scale a sequence of 4-element vectors into a flat array.
 * Each entry of sv points to 4 floats: dst[i+j] = sv[i/4][j] * mul.
 * Elements are processed in groups of 4, i advancing by 4 while i < len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 4) {
        const float *vec = *sv++;

        dst[base  ] = vec[0] * mul;
        dst[base+1] = vec[1] * mul;
        dst[base+2] = vec[2] * mul;
        dst[base+3] = vec[3] * mul;
    }
}
3805

    
3806
/**
 * In-place butterfly on two float vectors:
 * v1[i] becomes v1[i] + v2[i] and v2[i] becomes v1[i] - v2[i]
 * (both computed from the original values).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
3816

    
3817
/**
 * Dot product of two float vectors, accumulated in a float from index 0 upwards.
 * @return sum of v1[i] * v2[i] for i in [0, len)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
3827

    
3828
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] where min
 * is negative and max is positive, using only unsigned integer compares.
 * @param a        bit pattern of the value
 * @param mini     bit pattern of min (sign bit set)
 * @param maxi     bit pattern of max
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* patterns above mini have the sign bit set and a larger magnitude than min */
    if(a > mini) return mini;
    /* flipping the sign bit puts positive patterns above negative ones, so this
     * is true only for positive values larger than max.
     * 1U (not 1): shifting a signed 1 into the sign bit is undefined behaviour. */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip len floats from src into dst for the case *min < 0 < *max.
 * Values are handled as raw 32-bit patterns and processed 8 at a time, so
 * whole groups of 8 are read and written until the index reaches len.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
3855
/**
 * Clip floats from src into dst to the range [min, max].
 * When min < 0 < max the integer-compare variant is used; otherwise every
 * element goes through av_clipf(). Elements are processed in groups of 8,
 * so whole groups are read and written until the index reaches len.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
3872

    
3873
/**
 * Dot product of two int16_t vectors where every individual product is
 * shifted right by shift before being accumulated.
 * @param order number of elements
 * @return the accumulated sum
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int total = 0;
    int k;

    for (k = 0; order; order--, k++)
        total += (v1[k] * v2[k]) >> shift;

    return total;
}
3882

    
3883
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1 in
 * place with v1[i] += mul * v3[i]. The product for element i uses the value
 * of v1[i] before it is updated.
 * @param order number of elements
 * @return the dot product
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int total = 0;
    int k;

    for (k = 0; order; order--, k++) {
        total += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return total;
}
3892

    
3893
/**
 * Apply a symmetric window to int16_t samples.
 * Only the first len/2 window entries are read; window[i] is applied to
 * input[i] and to its mirror input[len-1-i]. Each product is rounded
 * (+ 1<<14) and shifted right by 15.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    const int half = len >> 1;
    int k;

    for (k = 0; k < half; k++) {
        const unsigned int mirror = len-k-1;
        const int16_t coef        = window[k];

        output[k]      = (MUL16(input[k],      coef) + (1 << 14)) >> 15;
        output[mirror] = (MUL16(input[mirror], coef) + (1 << 14)) >> 15;
    }
}
3905

    
3906
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional inverse transform of the 8 consecutive coefficients b[0..7],
 * in place. Results are rounded and shifted right by 8.
 */
static void wmv2_idct_row(short * b)
{
    const int rnd = 1 << 7;
    /* step 1: weighted pairs */
    const int a1 = W1*b[1]+W7*b[7];
    const int a7 = W7*b[1]-W1*b[7];
    const int a5 = W5*b[5]+W3*b[3];
    const int a3 = W3*b[5]-W5*b[3];
    const int a2 = W2*b[2]+W6*b[6];
    const int a6 = W6*b[2]-W2*b[6];
    const int a0 = W0*b[0]+W0*b[4];
    const int a4 = W0*b[0]-W0*b[4];
    /* step 2: the two terms scaled by 181/256 */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: combine into mirrored output pairs */
    const int e0 = a0 + a2, e3 = a0 - a2;
    const int e1 = a4 + a6, e2 = a4 - a6;
    const int o0 = a1 + a5, o3 = a7 + a3;

    b[0] = (e0 + o0 + rnd)>>8;
    b[7] = (e0 - o0 + rnd)>>8;
    b[1] = (e1 + s1 + rnd)>>8;
    b[6] = (e1 - s1 + rnd)>>8;
    b[2] = (e2 + s2 + rnd)>>8;
    b[5] = (e2 - s2 + rnd)>>8;
    b[3] = (e3 + o3 + rnd)>>8;
    b[4] = (e3 - o3 + rnd)>>8;
}

/**
 * One-dimensional inverse transform of the 8 coefficients b[0], b[8], ...,
 * b[56] (one column of an 8x8 block), in place. Step 1 results are shifted
 * right by 3; final results are rounded and shifted right by 14.
 */
static void wmv2_idct_col(short * b)
{
    const int rnd = 1 << 13;
    /* step 1, with extended precision */
    const int a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    const int a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    const int a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    const int a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    const int a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    const int a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    const int a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    const int a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* step 2: the two terms scaled by 181/256 */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: combine into mirrored output pairs */
    const int e0 = a0 + a2, e3 = a0 - a2;
    const int e1 = a4 + a6, e2 = a4 - a6;
    const int o0 = a1 + a5, o3 = a7 + a3;

    b[8*0] = (e0 + o0 + rnd)>>14;
    b[8*7] = (e0 - o0 + rnd)>>14;
    b[8*1] = (e1 + s1 + rnd)>>14;
    b[8*6] = (e1 - s1 + rnd)>>14;
    b[8*2] = (e2 + s2 + rnd)>>14;
    b[8*5] = (e2 - s2 + rnd)>>14;
    b[8*3] = (e3 + o3 + rnd)>>14;
    b[8*4] = (e3 - o3 + rnd)>>14;
}

/**
 * In-place 8x8 inverse transform: all eight rows first, then all eight columns.
 * @param block 64 coefficients, row after row
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
3978
/* XXX: these functions should be removed as soon as all IDCTs are
 converted */
3980
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the transformed block
 * to ff_put_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
3985
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the transformed block
 * to ff_add_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
3990
/**
 * Run j_rev_dct() on block in place, then pass the transformed block to
 * ff_put_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
3995
/**
 * Run j_rev_dct() on block in place, then pass the transformed block to
 * ff_add_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
4000

    
4001
/**
 * Run j_rev_dct4() on block in place, then pass the transformed block to
 * put_pixels_clamped4_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to put_pixels_clamped4_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
4006
/**
 * Run j_rev_dct4() on block in place, then pass the transformed block to
 * add_pixels_clamped4_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to add_pixels_clamped4_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4011

    
4012
/**
 * Run j_rev_dct2() on block in place, then pass the transformed block to
 * put_pixels_clamped2_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to put_pixels_clamped2_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
4017
/**
 * Run j_rev_dct2() on block in place, then pass the transformed block to
 * add_pixels_clamped2_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to add_pixels_clamped2_c()
 * @param block     coefficient block; overwritten by the inverse transform
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4022

    
4023
/**
 * Single-pixel "put": writes one output pixel derived from block[0].
 *
 * The value (block[0] + 4) >> 3 is looked up in ff_cropTbl, offset by
 * MAX_NEG_CROP, and the table entry is stored in dest[0].
 *
 * @param dest      destination; only dest[0] is written
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
4029
/**
 * Single-pixel "add": adds a value derived from block[0] to dest[0].
 *
 * The sum dest[0] + ((block[0] + 4) >> 3) is looked up in ff_cropTbl,
 * offset by MAX_NEG_CROP, and the table entry is stored back in dest[0].
 *
 * @param dest      destination; only dest[0] is read and written
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
4035

    
4036
/* No-op callback: takes a memory pointer, a stride and a height and
 * deliberately ignores all three. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused)
{
}
4037

    
4038
/* init static data */
4039
av_cold void dsputil_static_init(void)
4040
{
4041
    int i;
4042

    
4043
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4044
    for(i=0;i<MAX_NEG_CROP;i++) {
4045
        ff_cropTbl[i] = 0;
4046
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4047
    }
4048

    
4049
    for(i=0;i<512;i++) {
4050
        ff_squareTbl[i] = (i - 256) * (i - 256);
4051
    }
4052

    
4053
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4054
}
4055

    
4056
int ff_check_alignment(void){
4057
    static int did_fail=0;
4058
    DECLARE_ALIGNED(16, int, aligned);
4059

    
4060
    if((intptr_t)&aligned & 15){
4061
        if(!did_fail){
4062
#if HAVE_MMX || HAVE_ALTIVEC
4063
            av_log(NULL, AV_LOG_ERROR,
4064
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4065
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
4066
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4067
                "Do not report crashes to FFmpeg developers.\n");
4068
#endif
4069
            did_fail=1;
4070
        }
4071
        return -1;
4072
    }
4073
    return 0;
4074
}
4075

    
4076
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4077
{
4078
    int i;
4079

    
4080
    ff_check_alignment();
4081

    
4082
#if CONFIG_ENCODERS
4083
    if(avctx->dct_algo==FF_DCT_FASTINT) {
4084
        c->fdct = fdct_ifast;
4085
        c->fdct248 = fdct_ifast248;
4086
    }
4087
    else if(avctx->dct_algo==FF_DCT_FAAN) {
4088
        c->fdct = ff_faandct;
4089
        c->fdct248 = ff_faandct248;
4090
    }
4091
    else {
4092
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4093
        c->fdct248 = ff_fdct248_islow;
4094
    }
4095
#endif //CONFIG_ENCODERS
4096

    
4097
    if(avctx->lowres==1){
4098
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4099
            c->idct_put= ff_jref_idct4_put;
4100
            c->idct_add= ff_jref_idct4_add;
4101
        }else{
4102
            c->idct_put= ff_h264_lowres_idct_put_c;
4103
            c->idct_add= ff_h264_lowres_idct_add_c;
4104
        }
4105
        c->idct    = j_rev_dct4;
4106
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4107
    }else if(avctx->lowres==2){
4108
        c->idct_put= ff_jref_idct2_put;
4109
        c->idct_add= ff_jref_idct2_add;
4110
        c->idct    = j_rev_dct2;
4111
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4112
    }else if(avctx->lowres==3){
4113
        c->idct_put= ff_jref_idct1_put;
4114
        c->idct_add= ff_jref_idct1_add;
4115
        c->idct    = j_rev_dct1;
4116
        c->idct_permutation_type= FF_NO_IDCT_PERM;
4117
    }else{
4118
        if(avctx->idct_algo==FF_IDCT_INT){
4119
            c->idct_put= ff_jref_idct_put;
4120
            c->idct_add= ff_jref_idct_add;
4121
            c->idct    = j_rev_dct;
4122
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4123
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4124
                avctx->idct_algo==FF_IDCT_VP3){
4125
            c->idct_put= ff_vp3_idct_put_c;
4126
            c->idct_add= ff_vp3_idct_add_c;
4127
            c->idct    = ff_vp3_idct_c;
4128
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4129
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
4130
            c->idct_put= ff_wmv2_idct_put_c;
4131
            c->idct_add= ff_wmv2_idct_add_c;
4132
            c->idct    = ff_wmv2_idct_c;
4133
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4134
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
4135
            c->idct_put= ff_faanidct_put;
4136
            c->idct_add= ff_faanidct_add;
4137
            c->idct    = ff_faanidct;
4138
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4139
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4140
            c->idct_put= ff_ea_idct_put_c;
4141
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4142
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4143
            c->idct     = ff_bink_idct_c;
4144
            c->idct_add = ff_bink_idct_add_c;
4145
            c->idct_put = ff_bink_idct_put_c;
4146
            c->idct_permutation_type = FF_NO_IDCT_PERM;
4147
        }else{ //accurate/default
4148
            c->idct_put= ff_simple_idct_put;
4149
            c->idct_add= ff_simple_idct_add;
4150
            c->idct    = ff_simple_idct;
4151
            c->idct_permutation_type= FF_NO_IDCT_PERM;
4152
        }
4153
    }
4154

    
4155
    c->get_pixels = get_pixels_c;
4156
    c->diff_pixels = diff_pixels_c;
4157
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
4158
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4159
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4160
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
4161
    c->add_pixels8 = add_pixels8_c;
4162
    c->add_pixels4 = add_pixels4_c;
4163
    c->sum_abs_dctelem = sum_abs_dctelem_c;
4164
    c->emulated_edge_mc = ff_emulated_edge_mc;
4165
    c->gmc1 = gmc1_c;
4166
    c->gmc = ff_gmc_c;
4167
    c->clear_block = clear_block_c;
4168
    c->clear_blocks = clear_blocks_c;
4169
    c->pix_sum = pix_sum_c;
4170
    c->pix_norm1 = pix_norm1_c;
4171

    
4172
    c->fill_block_tab[0] = fill_block16_c;
4173
    c->fill_block_tab[1] = fill_block8_c;
4174
    c->scale_block = scale_block_c;
4175

    
4176
    /* TODO [0] 16  [1] 8 */
4177
    c->pix_abs[0][0] = pix_abs16_c;
4178
    c->pix_abs[0][1] = pix_abs16_x2_c;
4179
    c->pix_abs[0][2] = pix_abs16_y2_c;
4180
    c->pix_abs[0][3] = pix_abs16_xy2_c;
4181
    c->pix_abs[1][0] = pix_abs8_c;
4182
    c->pix_abs[1][1] = pix_abs8_x2_c;
4183
    c->pix_abs[1][2] = pix_abs8_y2_c;
4184
    c->pix_abs[1][3] = pix_abs8_xy2_c;
4185

    
4186
#define dspfunc(PFX, IDX, NUM) \
4187
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4188
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4189
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4190
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4191

    
4192
    dspfunc(put, 0, 16);
4193
    dspfunc(put_no_rnd, 0, 16);
4194
    dspfunc(put, 1, 8);
4195
    dspfunc(put_no_rnd, 1, 8);
4196
    dspfunc(put, 2, 4);
4197
    dspfunc(put, 3, 2);
4198

    
4199
    dspfunc(avg, 0, 16);
4200
    dspfunc(avg_no_rnd, 0, 16);
4201
    dspfunc(avg, 1, 8);
4202
    dspfunc(avg_no_rnd, 1, 8);
4203
    dspfunc(avg, 2, 4);
4204
    dspfunc(avg, 3, 2);
4205
#undef dspfunc
4206

    
4207
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4208
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4209

    
4210
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4211
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4212
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4213
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4214
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4215
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4216
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4217
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4218
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4219

    
4220
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4221
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4222
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4223
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4224
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4225
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4226
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4227
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4228
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4229

    
4230
#define dspfunc(PFX, IDX, NUM) \
4231
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4232
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4233
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4234
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4235
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4236
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4237
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4238
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4239
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4240
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4241
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4242
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4243
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4244
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4245
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4246
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4247

    
4248
    dspfunc(put_qpel, 0, 16);
4249
    dspfunc(put_no_rnd_qpel, 0, 16);
4250

    
4251
    dspfunc(avg_qpel, 0, 16);
4252
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4253

    
4254
    dspfunc(put_qpel, 1, 8);
4255
    dspfunc(put_no_rnd_qpel, 1, 8);
4256

    
4257
    dspfunc(avg_qpel, 1, 8);
4258
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4259

    
4260
    dspfunc(put_h264_qpel, 0, 16);
4261
    dspfunc(put_h264_qpel, 1, 8);
4262
    dspfunc(put_h264_qpel, 2, 4);
4263
    dspfunc(put_h264_qpel, 3, 2);
4264
    dspfunc(avg_h264_qpel, 0, 16);
4265
    dspfunc(avg_h264_qpel, 1, 8);
4266
    dspfunc(avg_h264_qpel, 2, 4);
4267

    
4268
#undef dspfunc
4269
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4270
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4271
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4272
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4273
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4274
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4275

    
4276
    c->draw_edges = draw_edges_c;
4277

    
4278
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4279
    ff_mlp_init(c, avctx);
4280
#endif
4281
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4282
    ff_intrax8dsp_init(c,avctx);
4283
#endif
4284
#if CONFIG_RV30_DECODER
4285
    ff_rv30dsp_init(c,avctx);
4286
#endif
4287
#if CONFIG_RV40_DECODER
4288
    ff_rv40dsp_init(c,avctx);
4289
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4290
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4291
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4292
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4293
#endif
4294

    
4295
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4296
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4297
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4298
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;