Statistics
| Branch: | Revision:

ffmpeg / libavcodec / dsputil.c @ 8dffcca5

History | View | Annotate | Download (108 KB)

1
/*
2
 * DSP utils
3
 * Copyright (c) 2000, 2001 Fabrice Bellard
4
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
 *
6
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7
 *
8
 * This file is part of FFmpeg.
9
 *
10
 * FFmpeg is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU Lesser General Public
12
 * License as published by the Free Software Foundation; either
13
 * version 2.1 of the License, or (at your option) any later version.
14
 *
15
 * FFmpeg is distributed in the hope that it will be useful,
16
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
 * Lesser General Public License for more details.
19
 *
20
 * You should have received a copy of the GNU Lesser General Public
21
 * License along with FFmpeg; if not, write to the Free Software
22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
 */
24

    
25
/**
26
 * @file
27
 * DSP utils
28
 */
29

    
30
#include "libavutil/imgutils.h"
31
#include "avcodec.h"
32
#include "dsputil.h"
33
#include "simple_idct.h"
34
#include "faandct.h"
35
#include "faanidct.h"
36
#include "mathops.h"
37
#include "mpegvideo.h"
38
#include "config.h"
39
#include "ac3dec.h"
40
#include "vorbis.h"
41
#include "png.h"
42

    
43
/* Shared lookup tables, zeroed here and filled in at runtime by the DSP
 * init code (not visible in this file chunk):
 *  - ff_cropTbl: indexed with a MAX_NEG_CROP bias to clamp ints to 0..255
 *  - ff_squareTbl: presumably x*x for x in [-256,255] (biased by 256) —
 *    used below as sq[pix1 - pix2]. */
uint8_t  ff_cropTbl[256 + 2 * MAX_NEG_CROP] = { 0, };
uint32_t ff_squareTbl[512]                  = { 0, };
45

    
46
#include "dsputil_internal.h"
47

    
48
/* Every byte of a native-width unsigned long set to 0x7f / 0x80
 * (0x7f7f7f7f on 32-bit longs, 0x7f7f7f7f7f7f7f7f on 64-bit):
 * ~0UL/255 yields 0x0101...01, which is then scaled per byte. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
51

    
52
/* Classic JPEG/MPEG zigzag scan order: maps scan position -> raster index. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
62

    
63
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE that unlike the
 * specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
75

    
76
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
77
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
78

    
79
/* Alternate (horizontal-first) coefficient scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
89

    
90
/* Alternate (vertical-first) coefficient scan order, used for interlaced
 * macroblocks. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
100

    
101
/* Input (coefficient) permutation required by simple_idct_mmx. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
112

    
113
/* Row permutation applied by the SSE2 IDCT path. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
114

    
115
/* Initialize a ScanTable: apply the IDCT coefficient permutation to the
 * given scan order and precompute, for every scan position, the highest
 * permutated index encountered so far (raster_end). */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, end;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        const int j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;   /* inverse mapping, kept only on PPC builds */
#endif
    }

    /* running maximum of the permutated indices */
    end = -1;
    for (i = 0; i < 64; i++) {
        if (st->permutated[i] > end)
            end = st->permutated[i];
        st->raster_end[i] = end;
    }
}
138

    
139
/* Sum of all sample values of a 16x16 block.
 * pix: top-left sample, line_size: byte stride between rows. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            total += pix[x];
        pix += line_size;
    }
    return total;
}
160

    
161
static int pix_norm1_c(uint8_t * pix, int line_size)
162
{
163
    int s, i, j;
164
    uint32_t *sq = ff_squareTbl + 256;
165

    
166
    s = 0;
167
    for (i = 0; i < 16; i++) {
168
        for (j = 0; j < 16; j += 8) {
169
#if 0
170
            s += sq[pix[0]];
171
            s += sq[pix[1]];
172
            s += sq[pix[2]];
173
            s += sq[pix[3]];
174
            s += sq[pix[4]];
175
            s += sq[pix[5]];
176
            s += sq[pix[6]];
177
            s += sq[pix[7]];
178
#else
179
#if LONG_MAX > 2147483647
180
            register uint64_t x=*(uint64_t*)pix;
181
            s += sq[x&0xff];
182
            s += sq[(x>>8)&0xff];
183
            s += sq[(x>>16)&0xff];
184
            s += sq[(x>>24)&0xff];
185
            s += sq[(x>>32)&0xff];
186
            s += sq[(x>>40)&0xff];
187
            s += sq[(x>>48)&0xff];
188
            s += sq[(x>>56)&0xff];
189
#else
190
            register uint32_t x=*(uint32_t*)pix;
191
            s += sq[x&0xff];
192
            s += sq[(x>>8)&0xff];
193
            s += sq[(x>>16)&0xff];
194
            s += sq[(x>>24)&0xff];
195
            x=*(uint32_t*)(pix+4);
196
            s += sq[x&0xff];
197
            s += sq[(x>>8)&0xff];
198
            s += sq[(x>>16)&0xff];
199
            s += sq[(x>>24)&0xff];
200
#endif
201
#endif
202
            pix += 8;
203
        }
204
        pix += line_size - 16;
205
    }
206
    return s;
207
}
208

    
209
/* Byte-swap w 32-bit words from src into dst (element-wise, so dst may
 * alias src). Main loop is unrolled by 8; tail handles the remainder. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
226

    
227
/* Byte-swap len 16-bit words from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
232

    
233
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234
{
235
    int s, i;
236
    uint32_t *sq = ff_squareTbl + 256;
237

    
238
    s = 0;
239
    for (i = 0; i < h; i++) {
240
        s += sq[pix1[0] - pix2[0]];
241
        s += sq[pix1[1] - pix2[1]];
242
        s += sq[pix1[2] - pix2[2]];
243
        s += sq[pix1[3] - pix2[3]];
244
        pix1 += line_size;
245
        pix2 += line_size;
246
    }
247
    return s;
248
}
249

    
250
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251
{
252
    int s, i;
253
    uint32_t *sq = ff_squareTbl + 256;
254

    
255
    s = 0;
256
    for (i = 0; i < h; i++) {
257
        s += sq[pix1[0] - pix2[0]];
258
        s += sq[pix1[1] - pix2[1]];
259
        s += sq[pix1[2] - pix2[2]];
260
        s += sq[pix1[3] - pix2[3]];
261
        s += sq[pix1[4] - pix2[4]];
262
        s += sq[pix1[5] - pix2[5]];
263
        s += sq[pix1[6] - pix2[6]];
264
        s += sq[pix1[7] - pix2[7]];
265
        pix1 += line_size;
266
        pix2 += line_size;
267
    }
268
    return s;
269
}
270

    
271
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272
{
273
    int s, i;
274
    uint32_t *sq = ff_squareTbl + 256;
275

    
276
    s = 0;
277
    for (i = 0; i < h; i++) {
278
        s += sq[pix1[ 0] - pix2[ 0]];
279
        s += sq[pix1[ 1] - pix2[ 1]];
280
        s += sq[pix1[ 2] - pix2[ 2]];
281
        s += sq[pix1[ 3] - pix2[ 3]];
282
        s += sq[pix1[ 4] - pix2[ 4]];
283
        s += sq[pix1[ 5] - pix2[ 5]];
284
        s += sq[pix1[ 6] - pix2[ 6]];
285
        s += sq[pix1[ 7] - pix2[ 7]];
286
        s += sq[pix1[ 8] - pix2[ 8]];
287
        s += sq[pix1[ 9] - pix2[ 9]];
288
        s += sq[pix1[10] - pix2[10]];
289
        s += sq[pix1[11] - pix2[11]];
290
        s += sq[pix1[12] - pix2[12]];
291
        s += sq[pix1[13] - pix2[13]];
292
        s += sq[pix1[14] - pix2[14]];
293
        s += sq[pix1[15] - pix2[15]];
294

    
295
        pix1 += line_size;
296
        pix2 += line_size;
297
    }
298
    return s;
299
}
300

    
301
/* Copy an 8x8 block of pixels into a DCT coefficient block (widening). */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = pixels[x];
        pixels += line_size;
        block  += 8;
    }
}
319

    
320
/* Store the element-wise difference s1 - s2 of two 8x8 pixel blocks into
 * a DCT coefficient block. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            block[x] = s1[x] - s2[x];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
339

    
340

    
341
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
342
                             int line_size)
343
{
344
    int i;
345
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
346

    
347
    /* read the pixels */
348
    for(i=0;i<8;i++) {
349
        pixels[0] = cm[block[0]];
350
        pixels[1] = cm[block[1]];
351
        pixels[2] = cm[block[2]];
352
        pixels[3] = cm[block[3]];
353
        pixels[4] = cm[block[4]];
354
        pixels[5] = cm[block[5]];
355
        pixels[6] = cm[block[6]];
356
        pixels[7] = cm[block[7]];
357

    
358
        pixels += line_size;
359
        block += 8;
360
    }
361
}
362

    
363
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
364
                                 int line_size)
365
{
366
    int i;
367
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
368

    
369
    /* read the pixels */
370
    for(i=0;i<4;i++) {
371
        pixels[0] = cm[block[0]];
372
        pixels[1] = cm[block[1]];
373
        pixels[2] = cm[block[2]];
374
        pixels[3] = cm[block[3]];
375

    
376
        pixels += line_size;
377
        block += 8;
378
    }
379
}
380

    
381
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
382
                                 int line_size)
383
{
384
    int i;
385
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
386

    
387
    /* read the pixels */
388
    for(i=0;i<2;i++) {
389
        pixels[0] = cm[block[0]];
390
        pixels[1] = cm[block[1]];
391

    
392
        pixels += line_size;
393
        block += 8;
394
    }
395
}
396

    
397
/* Write an 8x8 block of signed DCT values to pixels, biased by +128 and
 * clamped to 0..255 (so inputs below -128 become 0, above 127 become 255). */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = block[x] + 128;
            if (v < 0)
                v = 0;
            else if (v > 255)
                v = 255;
            pixels[x] = (uint8_t)v;
        }
        block  += 8;
        pixels += line_size;
    }
}
417

    
418
/* Write an 8x8 DCT block to pixels with a plain (truncating) narrowing
 * store — no clamping. */
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            pixels[x] = block[x];
        pixels += line_size;
        block  += 8;
    }
}
438

    
439
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
440
                             int line_size)
441
{
442
    int i;
443
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
444

    
445
    /* read the pixels */
446
    for(i=0;i<8;i++) {
447
        pixels[0] = cm[pixels[0] + block[0]];
448
        pixels[1] = cm[pixels[1] + block[1]];
449
        pixels[2] = cm[pixels[2] + block[2]];
450
        pixels[3] = cm[pixels[3] + block[3]];
451
        pixels[4] = cm[pixels[4] + block[4]];
452
        pixels[5] = cm[pixels[5] + block[5]];
453
        pixels[6] = cm[pixels[6] + block[6]];
454
        pixels[7] = cm[pixels[7] + block[7]];
455
        pixels += line_size;
456
        block += 8;
457
    }
458
}
459

    
460
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461
                          int line_size)
462
{
463
    int i;
464
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
465

    
466
    /* read the pixels */
467
    for(i=0;i<4;i++) {
468
        pixels[0] = cm[pixels[0] + block[0]];
469
        pixels[1] = cm[pixels[1] + block[1]];
470
        pixels[2] = cm[pixels[2] + block[2]];
471
        pixels[3] = cm[pixels[3] + block[3]];
472
        pixels += line_size;
473
        block += 8;
474
    }
475
}
476

    
477
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478
                          int line_size)
479
{
480
    int i;
481
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
482

    
483
    /* read the pixels */
484
    for(i=0;i<2;i++) {
485
        pixels[0] = cm[pixels[0] + block[0]];
486
        pixels[1] = cm[pixels[1] + block[1]];
487
        pixels += line_size;
488
        block += 8;
489
    }
490
}
491

    
492
/* Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int total = 0;
    int i;

    for (i = 0; i < 64; i++)
        total += FFABS(block[i]);
    return total;
}
499

    
500
/* Fill a 16-wide block of h rows with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
509

    
510
/* Fill an 8-wide block of h rows with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
519

    
520
/* Upscale an 8x8 block 2x in both directions: each source byte is written
 * as a 2x2 square of identical bytes. Two destination rows are produced
 * per source row via the two uint16_t row pointers; note row0/row1 advance
 * by `linesize` uint16_t elements, i.e. 2*linesize bytes = two rows. */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int x, y;
    uint16_t *row0 = (uint16_t *) dst;
    uint16_t *row1 = (uint16_t *)(dst + linesize);

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            const uint16_t pair = src[x] * 0x0101; /* byte doubled, endian-safe */
            row0[x] = pair;
            row1[x] = pair;
        }
        src  += 8;
        row0 += linesize;
        row1 += linesize;
    }
}
535

    
536
/* Rounded 2- and 4-sample averages used by the pixel interpolation code. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
538

    
539
/* One-point GMC: bilinear interpolation of an 8-wide strip of h rows at a
 * fixed 1/16-pel fractional offset (x16, y16), with the given rounder
 * added before the >>8 normalization (weights A+B+C+D == 256). */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x] +
                      B * src[x + 1] +
                      C * src[x + stride] +
                      D * src[x + stride + 1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
561

    
562
/* Global motion compensation for one 8-wide strip of h rows.
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * (dxx,dyx) advance it per output column and (dxy,dyy) per output row.
 * shift/s give the interpolation sub-pel precision, r the rounder, and
 * width/height bound the valid source area (samples outside are clipped
 * to the nearest valid row/column). */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y;
    const int s = 1 << shift;

    /* turn the dimensions into inclusive maxima for the range tests */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int index;

            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width && (unsigned)src_y < height) {
                /* fully inside: full bilinear interpolation */
                index = src_x + src_y * stride;
                dst[y * stride + x] = ((src[index             ] * (s - frac_x) +
                                        src[index + 1         ] *      frac_x) * (s - frac_y) +
                                       (src[index + stride    ] * (s - frac_x) +
                                        src[index + stride + 1] *      frac_x) *      frac_y +
                                       r) >> (shift * 2);
            } else if ((unsigned)src_x < width) {
                /* y outside: clip y, interpolate horizontally only */
                index = src_x + av_clip(src_y, 0, height) * stride;
                dst[y * stride + x] = ((src[index    ] * (s - frac_x) +
                                        src[index + 1] *      frac_x) * s +
                                       r) >> (shift * 2);
            } else if ((unsigned)src_y < height) {
                /* x outside: clip x, interpolate vertically only */
                index = av_clip(src_x, 0, width) + src_y * stride;
                dst[y * stride + x] = ((src[index         ] * (s - frac_y) +
                                        src[index + stride] *      frac_y) * s +
                                       r) >> (shift * 2);
            } else {
                /* both outside: nearest clipped sample, no interpolation */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                dst[y * stride + x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
619

    
620
/* Thirdpel MC, integer position: dispatch a plain copy by block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
628

    
629
/* Thirdpel MC at (1/3, 0): out = round((2*a + b)/3), via 683/2048 ~ 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
639

    
640
/* Thirdpel MC at (2/3, 0): out = round((a + 2*b)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
650

    
651
/* Thirdpel MC at (0, 1/3): vertical blend of the current and next row. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
661

    
662
/* Thirdpel MC at (1/3, 1/3): 2D blend with weights 4/3/3/2 over the 2x2
 * neighbourhood, via 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
672

    
673
/* Thirdpel MC at (1/3, 2/3): 2D blend with weights 3/2/4/3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] +
                              4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
683

    
684
/* Thirdpel MC at (0, 2/3): vertical blend favouring the next row. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
694

    
695
/* Thirdpel MC at (2/3, 1/3): 2D blend with weights 3/4/2/3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] +
                              2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
705

    
706
/* Thirdpel MC at (2/3, 2/3): 2D blend with weights 2/3/3/4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] +
                              3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
716

    
717
/* Thirdpel MC, integer position, averaging: dispatch by block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
725

    
726
/* Thirdpel MC at (1/3, 0), rounded-up average with existing dst contents. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
736

    
737
/* Thirdpel MC at (2/3, 0), rounded-up average with existing dst contents. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
747

    
748
/* Thirdpel MC at (0, 1/3), rounded-up average with existing dst contents. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
758

    
759
/* Thirdpel MC at (1/3, 1/3) (weights 4/3/3/2), averaged with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1] +
                                         3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
769

    
770
/* Thirdpel MC at (1/3, 2/3) (weights 3/2/4/3), averaged with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1] +
                                         4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
780

    
781
/* Thirdpel MC at (0, 2/3), rounded-up average with existing dst contents. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
791

    
792
/* Thirdpel MC at (2/3, 1/3) (weights 3/4/2/3), averaged with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1] +
                                         2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
802

    
803
/* Thirdpel MC at (2/3, 2/3) (weights 2/3/3/4), averaged with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1] +
                                         3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
813
#if 0
814
#define TPEL_WIDTH(width)\
815
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
816
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
817
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
818
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
819
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
820
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
821
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
822
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
823
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
824
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
825
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
826
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
827
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
828
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
829
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
830
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
831
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
832
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
833
#endif
834

    
835
#define QPEL_MC(r, OPNAME, RND, OP) \
836
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
837
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
838
    int i;\
839
    for(i=0; i<h; i++)\
840
    {\
841
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
842
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
843
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
844
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
845
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
846
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
847
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
848
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
849
        dst+=dstStride;\
850
        src+=srcStride;\
851
    }\
852
}\
853
\
854
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
855
    const int w=8;\
856
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
857
    int i;\
858
    for(i=0; i<w; i++)\
859
    {\
860
        const int src0= src[0*srcStride];\
861
        const int src1= src[1*srcStride];\
862
        const int src2= src[2*srcStride];\
863
        const int src3= src[3*srcStride];\
864
        const int src4= src[4*srcStride];\
865
        const int src5= src[5*srcStride];\
866
        const int src6= src[6*srcStride];\
867
        const int src7= src[7*srcStride];\
868
        const int src8= src[8*srcStride];\
869
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
870
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
871
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
872
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
873
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
874
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
875
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
876
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
877
        dst++;\
878
        src++;\
879
    }\
880
}\
881
\
882
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
883
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
884
    int i;\
885
    \
886
    for(i=0; i<h; i++)\
887
    {\
888
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
889
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
890
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
891
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
892
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
893
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
894
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
895
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
896
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
897
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
898
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
899
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
900
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
901
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
902
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
903
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
904
        dst+=dstStride;\
905
        src+=srcStride;\
906
    }\
907
}\
908
\
909
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
910
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
911
    int i;\
912
    const int w=16;\
913
    for(i=0; i<w; i++)\
914
    {\
915
        const int src0= src[0*srcStride];\
916
        const int src1= src[1*srcStride];\
917
        const int src2= src[2*srcStride];\
918
        const int src3= src[3*srcStride];\
919
        const int src4= src[4*srcStride];\
920
        const int src5= src[5*srcStride];\
921
        const int src6= src[6*srcStride];\
922
        const int src7= src[7*srcStride];\
923
        const int src8= src[8*srcStride];\
924
        const int src9= src[9*srcStride];\
925
        const int src10= src[10*srcStride];\
926
        const int src11= src[11*srcStride];\
927
        const int src12= src[12*srcStride];\
928
        const int src13= src[13*srcStride];\
929
        const int src14= src[14*srcStride];\
930
        const int src15= src[15*srcStride];\
931
        const int src16= src[16*srcStride];\
932
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
933
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
934
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
935
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
936
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
937
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
938
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
939
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
940
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
941
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
942
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
943
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
944
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
945
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
946
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
947
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
948
        dst++;\
949
        src++;\
950
    }\
951
}\
952
\
953
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
954
    uint8_t half[64];\
955
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
956
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
957
}\
958
\
959
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
960
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
961
}\
962
\
963
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
964
    uint8_t half[64];\
965
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
966
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
967
}\
968
\
969
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
970
    uint8_t full[16*9];\
971
    uint8_t half[64];\
972
    copy_block9(full, src, 16, stride, 9);\
973
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
974
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
975
}\
976
\
977
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
978
    uint8_t full[16*9];\
979
    copy_block9(full, src, 16, stride, 9);\
980
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
981
}\
982
\
983
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
984
    uint8_t full[16*9];\
985
    uint8_t half[64];\
986
    copy_block9(full, src, 16, stride, 9);\
987
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
988
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
989
}\
990
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
991
    uint8_t full[16*9];\
992
    uint8_t halfH[72];\
993
    uint8_t halfV[64];\
994
    uint8_t halfHV[64];\
995
    copy_block9(full, src, 16, stride, 9);\
996
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
998
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1000
}\
1001
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1002
    uint8_t full[16*9];\
1003
    uint8_t halfH[72];\
1004
    uint8_t halfHV[64];\
1005
    copy_block9(full, src, 16, stride, 9);\
1006
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1008
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1010
}\
1011
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1012
    uint8_t full[16*9];\
1013
    uint8_t halfH[72];\
1014
    uint8_t halfV[64];\
1015
    uint8_t halfHV[64];\
1016
    copy_block9(full, src, 16, stride, 9);\
1017
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1018
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1019
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1021
}\
1022
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1023
    uint8_t full[16*9];\
1024
    uint8_t halfH[72];\
1025
    uint8_t halfHV[64];\
1026
    copy_block9(full, src, 16, stride, 9);\
1027
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1028
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1029
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1030
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1031
}\
1032
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1033
    uint8_t full[16*9];\
1034
    uint8_t halfH[72];\
1035
    uint8_t halfV[64];\
1036
    uint8_t halfHV[64];\
1037
    copy_block9(full, src, 16, stride, 9);\
1038
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1039
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1040
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1041
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1042
}\
1043
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1044
    uint8_t full[16*9];\
1045
    uint8_t halfH[72];\
1046
    uint8_t halfHV[64];\
1047
    copy_block9(full, src, 16, stride, 9);\
1048
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1049
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1050
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1052
}\
1053
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1054
    uint8_t full[16*9];\
1055
    uint8_t halfH[72];\
1056
    uint8_t halfV[64];\
1057
    uint8_t halfHV[64];\
1058
    copy_block9(full, src, 16, stride, 9);\
1059
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1060
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1061
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1062
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1063
}\
1064
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1065
    uint8_t full[16*9];\
1066
    uint8_t halfH[72];\
1067
    uint8_t halfHV[64];\
1068
    copy_block9(full, src, 16, stride, 9);\
1069
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1071
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1072
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1073
}\
1074
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1075
    uint8_t halfH[72];\
1076
    uint8_t halfHV[64];\
1077
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1078
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1079
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1080
}\
1081
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1082
    uint8_t halfH[72];\
1083
    uint8_t halfHV[64];\
1084
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1085
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1086
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1087
}\
1088
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1089
    uint8_t full[16*9];\
1090
    uint8_t halfH[72];\
1091
    uint8_t halfV[64];\
1092
    uint8_t halfHV[64];\
1093
    copy_block9(full, src, 16, stride, 9);\
1094
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1095
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1096
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1097
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1098
}\
1099
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1100
    uint8_t full[16*9];\
1101
    uint8_t halfH[72];\
1102
    copy_block9(full, src, 16, stride, 9);\
1103
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1105
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1106
}\
1107
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1108
    uint8_t full[16*9];\
1109
    uint8_t halfH[72];\
1110
    uint8_t halfV[64];\
1111
    uint8_t halfHV[64];\
1112
    copy_block9(full, src, 16, stride, 9);\
1113
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1114
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1115
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1116
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1117
}\
1118
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1119
    uint8_t full[16*9];\
1120
    uint8_t halfH[72];\
1121
    copy_block9(full, src, 16, stride, 9);\
1122
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1124
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1125
}\
1126
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1127
    uint8_t halfH[72];\
1128
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1129
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1130
}\
1131
\
1132
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1133
    uint8_t half[256];\
1134
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1135
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1136
}\
1137
\
1138
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1139
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1140
}\
1141
\
1142
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1143
    uint8_t half[256];\
1144
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1145
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1146
}\
1147
\
1148
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1149
    uint8_t full[24*17];\
1150
    uint8_t half[256];\
1151
    copy_block17(full, src, 24, stride, 17);\
1152
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1153
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1154
}\
1155
\
1156
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1157
    uint8_t full[24*17];\
1158
    copy_block17(full, src, 24, stride, 17);\
1159
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1160
}\
1161
\
1162
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1163
    uint8_t full[24*17];\
1164
    uint8_t half[256];\
1165
    copy_block17(full, src, 24, stride, 17);\
1166
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1167
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1168
}\
1169
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1170
    uint8_t full[24*17];\
1171
    uint8_t halfH[272];\
1172
    uint8_t halfV[256];\
1173
    uint8_t halfHV[256];\
1174
    copy_block17(full, src, 24, stride, 17);\
1175
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1177
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1179
}\
1180
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1181
    uint8_t full[24*17];\
1182
    uint8_t halfH[272];\
1183
    uint8_t halfHV[256];\
1184
    copy_block17(full, src, 24, stride, 17);\
1185
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1187
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1189
}\
1190
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1191
    uint8_t full[24*17];\
1192
    uint8_t halfH[272];\
1193
    uint8_t halfV[256];\
1194
    uint8_t halfHV[256];\
1195
    copy_block17(full, src, 24, stride, 17);\
1196
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1197
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1198
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1200
}\
1201
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1202
    uint8_t full[24*17];\
1203
    uint8_t halfH[272];\
1204
    uint8_t halfHV[256];\
1205
    copy_block17(full, src, 24, stride, 17);\
1206
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1207
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1208
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1209
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1210
}\
1211
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1212
    uint8_t full[24*17];\
1213
    uint8_t halfH[272];\
1214
    uint8_t halfV[256];\
1215
    uint8_t halfHV[256];\
1216
    copy_block17(full, src, 24, stride, 17);\
1217
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1218
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1219
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1220
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1221
}\
1222
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1223
    uint8_t full[24*17];\
1224
    uint8_t halfH[272];\
1225
    uint8_t halfHV[256];\
1226
    copy_block17(full, src, 24, stride, 17);\
1227
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1228
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1229
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1231
}\
1232
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1233
    uint8_t full[24*17];\
1234
    uint8_t halfH[272];\
1235
    uint8_t halfV[256];\
1236
    uint8_t halfHV[256];\
1237
    copy_block17(full, src, 24, stride, 17);\
1238
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1239
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1240
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1242
}\
1243
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1244
    uint8_t full[24*17];\
1245
    uint8_t halfH[272];\
1246
    uint8_t halfHV[256];\
1247
    copy_block17(full, src, 24, stride, 17);\
1248
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1250
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1251
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1252
}\
1253
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1254
    uint8_t halfH[272];\
1255
    uint8_t halfHV[256];\
1256
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1257
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1258
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1259
}\
1260
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1261
    uint8_t halfH[272];\
1262
    uint8_t halfHV[256];\
1263
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1266
}\
1267
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1268
    uint8_t full[24*17];\
1269
    uint8_t halfH[272];\
1270
    uint8_t halfV[256];\
1271
    uint8_t halfHV[256];\
1272
    copy_block17(full, src, 24, stride, 17);\
1273
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1274
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1275
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1276
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1277
}\
1278
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1279
    uint8_t full[24*17];\
1280
    uint8_t halfH[272];\
1281
    copy_block17(full, src, 24, stride, 17);\
1282
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1284
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1285
}\
1286
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1287
    uint8_t full[24*17];\
1288
    uint8_t halfH[272];\
1289
    uint8_t halfV[256];\
1290
    uint8_t halfHV[256];\
1291
    copy_block17(full, src, 24, stride, 17);\
1292
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1293
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1294
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1295
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1296
}\
1297
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1298
    uint8_t full[24*17];\
1299
    uint8_t halfH[272];\
1300
    copy_block17(full, src, 24, stride, 17);\
1301
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1303
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1304
}\
1305
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1306
    uint8_t halfH[272];\
1307
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1308
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1309
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1312
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1313
#define op_put(a, b) a = cm[((b) + 16)>>5]
1314
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1315

    
1316
QPEL_MC(0, put_       , _       , op_put)
1317
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1318
QPEL_MC(0, avg_       , _       , op_avg)
1319
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
1320
#undef op_avg
1321
#undef op_avg_no_rnd
1322
#undef op_put
1323
#undef op_put_no_rnd
1324

    
1325
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
1326
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
1327
#define put_qpel16_mc00_c ff_put_pixels16x16_c
1328
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1329
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1330
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1333
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1334
    int i;
1335

    
1336
    for(i=0; i<h; i++){
1337
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1338
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1339
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1340
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1341
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1342
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1343
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1344
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1345
        dst+=dstStride;
1346
        src+=srcStride;
1347
    }
1348
}
#if CONFIG_RV40_DECODER
/* RV40 implements the (3,3) quarter-pel position as a plain diagonal
 * half-pel average, so these wrappers just forward to the xy2 helpers. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1366
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1367
    int i;
1368

    
1369
    for(i=0; i<w; i++){
1370
        const int src_1= src[ -srcStride];
1371
        const int src0 = src[0          ];
1372
        const int src1 = src[  srcStride];
1373
        const int src2 = src[2*srcStride];
1374
        const int src3 = src[3*srcStride];
1375
        const int src4 = src[4*srcStride];
1376
        const int src5 = src[5*srcStride];
1377
        const int src6 = src[6*srcStride];
1378
        const int src7 = src[7*srcStride];
1379
        const int src8 = src[8*srcStride];
1380
        const int src9 = src[9*srcStride];
1381
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1382
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
1383
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
1384
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
1385
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
1386
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
1387
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
1388
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
1389
        src++;
1390
        dst++;
1391
    }
1392
}
/* MSPEL (1,0): quarter-pel left position — average the source block with
 * the horizontal half-pel filter output. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* MSPEL (2,0): horizontal half-pel position — plain horizontal lowpass. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* MSPEL (3,0): quarter-pel right position — average the source shifted by
 * one pixel with the horizontal half-pel filter output. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* MSPEL (0,2): vertical half-pel position — plain vertical lowpass. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* MSPEL (1,2): average the vertical half-pel result with the combined
 * horizontal+vertical half-pel result. halfH holds 11 filtered rows
 * (one above, two below the block) so the vertical pass has context. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL (3,2): like (1,2) but the pure-vertical pass starts one pixel to
 * the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL (2,2): centre half-pel position — horizontal lowpass (with one row
 * of top context) followed by a vertical lowpass straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1439
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1440
    int x;
1441
    const int strength= ff_h263_loop_filter_strength[qscale];
1442

    
1443
    for(x=0; x<8; x++){
1444
        int d1, d2, ad1;
1445
        int p0= src[x-2*stride];
1446
        int p1= src[x-1*stride];
1447
        int p2= src[x+0*stride];
1448
        int p3= src[x+1*stride];
1449
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1450

    
1451
        if     (d<-2*strength) d1= 0;
1452
        else if(d<-  strength) d1=-2*strength - d;
1453
        else if(d<   strength) d1= d;
1454
        else if(d< 2*strength) d1= 2*strength - d;
1455
        else                   d1= 0;
1456

    
1457
        p1 += d1;
1458
        p2 -= d1;
1459
        if(p1&256) p1= ~(p1>>31);
1460
        if(p2&256) p2= ~(p2>>31);
1461

    
1462
        src[x-1*stride] = p1;
1463
        src[x+0*stride] = p2;
1464

    
1465
        ad1= FFABS(d1)>>1;
1466

    
1467
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1468

    
1469
        src[x-2*stride] = p0 - d2;
1470
        src[x+  stride] = p3 + d2;
1471
    }
1472
    }
1473
}
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1476
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1477
    int y;
1478
    const int strength= ff_h263_loop_filter_strength[qscale];
1479

    
1480
    for(y=0; y<8; y++){
1481
        int d1, d2, ad1;
1482
        int p0= src[y*stride-2];
1483
        int p1= src[y*stride-1];
1484
        int p2= src[y*stride+0];
1485
        int p3= src[y*stride+1];
1486
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1487

    
1488
        if     (d<-2*strength) d1= 0;
1489
        else if(d<-  strength) d1=-2*strength - d;
1490
        else if(d<   strength) d1= d;
1491
        else if(d< 2*strength) d1= 2*strength - d;
1492
        else                   d1= 0;
1493

    
1494
        p1 += d1;
1495
        p2 -= d1;
1496
        if(p1&256) p1= ~(p1>>31);
1497
        if(p2&256) p2= ~(p2>>31);
1498

    
1499
        src[y*stride-1] = p1;
1500
        src[y*stride+0] = p2;
1501

    
1502
        ad1= FFABS(d1)>>1;
1503

    
1504
        d2= av_clip((p0-p3)/4, -ad1, ad1);
1505

    
1506
        src[y*stride-2] = p0 - d2;
1507
        src[y*stride+1] = p3 + d2;
1508
    }
1509
    }
1510
}
1511

    
1512
/**
 * H.261 in-loop filter: a separable (1,2,1)/4 smoothing filter applied
 * in-place to an 8x8 block, with the outermost rows/columns passed through
 * unfiltered. The vertical pass goes into temp[] (scaled by 4), then the
 * horizontal pass writes back with combined rounding and >>4 normalization.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical (1,2,1) pass; top and bottom rows are copied (x4 keeps
       scaling consistent with the filtered rows) */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; left and right columns only
       undo the x4 scaling */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * 16xh sum of absolute differences (SAD) between two pixel blocks.
 * The first argument is an unused context pointer kept for the
 * me_cmp_func signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * 16xh SAD against the horizontal half-pel interpolation of pix2
 * (each reference pixel is the rounded average of itself and its
 * right neighbour; reads one column past the 16-wide block).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * 16xh SAD against the vertical half-pel interpolation of pix2
 * (average with the pixel one row below; reads one row past h).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * 16xh SAD against the diagonal half-pel interpolation of pix2
 * (rounded average of the 2x2 neighbourhood; reads one extra row
 * and one extra column).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/** 8xh sum of absolute differences (SAD) between two pixel blocks. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/** 8xh SAD against the horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/** 8xh SAD against the vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/** 8xh SAD against the diagonal half-pel interpolation of pix2. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1740
    MpegEncContext *c = v;
1741
    int score1=0;
1742
    int score2=0;
1743
    int x,y;
1744

    
1745
    for(y=0; y<h; y++){
1746
        for(x=0; x<16; x++){
1747
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1748
        }
1749
        if(y+1<h){
1750
            for(x=0; x<15; x++){
1751
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1752
                             - s1[x+1] + s1[x+1+stride])
1753
                        -FFABS(  s2[x  ] - s2[x  +stride]
1754
                             - s2[x+1] + s2[x+1+stride]);
1755
            }
1756
        }
1757
        s1+= stride;
1758
        s2+= stride;
1759
    }
1760

    
1761
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1762
    else  return score1 + FFABS(score2)*8;
1763
}
1764

    
1765
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1766
    MpegEncContext *c = v;
1767
    int score1=0;
1768
    int score2=0;
1769
    int x,y;
1770

    
1771
    for(y=0; y<h; y++){
1772
        for(x=0; x<8; x++){
1773
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
1774
        }
1775
        if(y+1<h){
1776
            for(x=0; x<7; x++){
1777
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
1778
                             - s1[x+1] + s1[x+1+stride])
1779
                        -FFABS(  s2[x  ] - s2[x  +stride]
1780
                             - s2[x+1] + s2[x+1+stride]);
1781
            }
1782
        }
1783
        s1+= stride;
1784
        s2+= stride;
1785
    }
1786

    
1787
    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1788
    else  return score1 + FFABS(score2)*8;
1789
}
1790

    
1791
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1792
    int i;
1793
    unsigned int sum=0;
1794

    
1795
    for(i=0; i<8*8; i++){
1796
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1797
        int w= weight[i];
1798
        b>>= RECON_SHIFT;
1799
        assert(-512<b && b<512);
1800

    
1801
        sum += (w*b)*(w*b)>>4;
1802
    }
1803
    return sum>>2;
1804
}
1805

    
1806
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1807
    int i;
1808

    
1809
    for(i=0; i<8*8; i++){
1810
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1811
    }
1812
}
1813

    
1814
/**
1815
 * permutes an 8x8 block.
1816
 * @param block the block which will be permuted according to the given permutation vector
1817
 * @param permutation the permutation vector
1818
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1819
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1820
 *                  (inverse) permutated to scantable order!
1821
 */
1822
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1823
{
1824
    int i;
1825
    DCTELEM temp[64];
1826

    
1827
    if(last<=0) return;
1828
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1829

    
1830
    for(i=0; i<=last; i++){
1831
        const int j= scantable[i];
1832
        temp[j]= block[j];
1833
        block[j]=0;
1834
    }
1835

    
1836
    for(i=0; i<=last; i++){
1837
        const int j= scantable[i];
1838
        const int perm_j= permutation[j];
1839
        block[perm_j]= temp[j];
1840
    }
1841
}
1842

    
1843
/** Comparison function that always reports a perfect match (FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fill the 6-entry comparison-function table cmp[] (one slot per block
 * size variant) with the DSPContext implementation selected by the
 * FF_CMP_* code stored in the low byte of type. Unknown codes leave the
 * table zeroed and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        /* wavelet-based metrics are only available with the DWT code built in */
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1908
    long i;
1909
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1910
        long a = *(long*)(src+i);
1911
        long b = *(long*)(dst+i);
1912
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1913
    }
1914
    for(; i<w; i++)
1915
        dst[i+0] += src[i+0];
1916
}
1917

    
1918
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1919
    long i;
1920
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1921
        long a = *(long*)(src1+i);
1922
        long b = *(long*)(src2+i);
1923
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1924
    }
1925
    for(; i<w; i++)
1926
        dst[i] = src1[i]+src2[i];
1927
}
1928

    
1929
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1930
    long i;
1931
#if !HAVE_FAST_UNALIGNED
1932
    if((long)src2 & (sizeof(long)-1)){
1933
        for(i=0; i+7<w; i+=8){
1934
            dst[i+0] = src1[i+0]-src2[i+0];
1935
            dst[i+1] = src1[i+1]-src2[i+1];
1936
            dst[i+2] = src1[i+2]-src2[i+2];
1937
            dst[i+3] = src1[i+3]-src2[i+3];
1938
            dst[i+4] = src1[i+4]-src2[i+4];
1939
            dst[i+5] = src1[i+5]-src2[i+5];
1940
            dst[i+6] = src1[i+6]-src2[i+6];
1941
            dst[i+7] = src1[i+7]-src2[i+7];
1942
        }
1943
    }else
1944
#endif
1945
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1946
        long a = *(long*)(src1+i);
1947
        long b = *(long*)(src2+i);
1948
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1949
    }
1950
    for(; i<w; i++)
1951
        dst[i+0] = src1[i+0]-src2[i+0];
1952
}
1953

    
1954
/**
 * HuffYUV median prediction decode: reconstruct w pixels from the stored
 * diffs, predicting each pixel from its left, top (src1[i]) and top-left
 * neighbours. *left and *left_top carry the predictor state across calls
 * and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t cur  = *left;
    uint8_t topl = *left_top;

    for (i = 0; i < w; i++) {
        cur  = mid_pred(cur, src1[i], (cur + src1[i] - topl) & 0xFF) + diff[i];
        topl = src1[i];
        dst[i] = cur;
    }

    *left     = cur;
    *left_top = topl;
}
/**
 * HuffYUV median prediction encode: emit, for each of w pixels, the
 * difference between src2[i] and its median prediction built from the
 * left, top (src1[i]) and top-left neighbours. *left and *left_top carry
 * predictor state across calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t cur  = *left;
    uint8_t topl = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur, src1[i], (cur + src1[i] - topl) & 0xFF);
        topl   = src1[i];
        cur    = src2[i];
        dst[i] = cur - pred;
    }

    *left     = cur;
    *left_top = topl;
}
/**
 * HuffYUV left-prediction decode: running bytewise prefix sum.
 * dst[i] takes the low byte of acc + src[0] + ... + src[i]; the return
 * value is the final (unmasked) accumulator so the caller can chain
 * slices together.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc += src[i];
        dst[i] = acc;  /* implicit truncation to the low byte */
    }

    return acc;
}
/* byte offsets of the B/G/R/A channels inside a packed 32-bit pixel,
 * chosen so the in-memory layout matches on both endiannesses */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-prediction decode over packed 32-bit BGRA pixels: each of
 * the four channels is an independent running bytewise sum. The channel
 * accumulators are carried across calls via *red/*green/*blue/*alpha.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        /* stores truncate to the low byte of each accumulator */
        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    /* hand the running sums back for the next slice */
    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
/* 2-point butterfly writing sum/difference to separate outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place 2-point butterfly: x,y <- x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* sum of absolute values of the two butterfly outputs, without storing them */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of the residual
 * (src - dst) followed by the sum of absolute transform coefficients.
 * Removed: a dead '#if 0' debug block that kept a function-scope static
 * maximum and printf'd it.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each row of the residual */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard on each column, accumulating |coeff|
     * in the final butterfly stage via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the pixels
 * themselves (no reference), sum of absolute coefficients, with the DC
 * term subtracted so the score measures texture rather than brightness.
 * The 'dummy' argument only exists to match the me_cmp_func signature.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard on each row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard on each column, accumulating |coeff| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
/**
 * DCT-domain SAD of an 8x8 block: forward DCT of the residual
 * (src1 - src2), then sum of absolute coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#if CONFIG_GPL
/* integer 8-point 1-D DCT in the H.264 high-profile style; reads via
 * SRC(i) and writes via DST(i,v), which the caller defines per pass */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * SAD in the H.264 8x8 transform domain: row pass transforms the
 * residual in place, column pass accumulates absolute coefficients
 * directly instead of storing them.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: accumulate |coefficient| without writing back */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2229
    MpegEncContext * const s= (MpegEncContext *)c;
2230
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2231
    int sum=0, i;
2232

    
2233
    assert(h==8);
2234

    
2235
    s->dsp.diff_pixels(temp, src1, src2, stride);
2236
    s->dsp.fdct(temp);
2237

    
2238
    for(i=0; i<64; i++)
2239
        sum= FFMAX(sum, FFABS(temp[i]));
2240

    
2241
    return sum;
2242
}
2243

    
2244
/**
 * Quantization-noise metric: SSE in the coefficient domain between the
 * residual's DCT and the same DCT after a quantize/dequantize round trip.
 * Approximates the distortion the codec's quantizer would introduce.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;  /* pristine copy of the coefficients */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* evaluate with the inter quantizer */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    /* quantize + dequantize, then compare against the saved original */
    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
/**
 * Rate-distortion metric for an 8x8 block: quantizes the residual,
 * counts the bits its run/level VLC coding would take, reconstructs the
 * block and measures the resulting SSE. Returns
 * distortion + lambda-scaled rate.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local copies so the reconstruction can be done in place */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC tables for the current macroblock mode; intra blocks
     * additionally pay for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count run/level code lengths; levels outside -64..63 cost the
     * escape-code length instead */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize and add the reconstruction back onto lsrc2 */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* lambda * rate, with 109/128 approximating the rd constant */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
/**
 * Rate-only metric for an 8x8 block: quantizes the residual and returns
 * the number of bits its run/level VLC coding would take (no
 * reconstruction or distortion term; compare with rd8x8_c).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* VLC tables for the current macroblock mode; intra blocks also
     * pay for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count run/level code lengths; levels outside -64..63 cost the
     * escape-code length instead */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
/**
 * Intra vertical SAD: sum of absolute differences between vertically
 * adjacent pixels — a cheap measure of vertical activity within one
 * block. The macro generates the 8- and 16-pixel-wide variants; 'dummy'
 * only exists to match the me_cmp_func signature.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/**
 * Vertical SAD of the residual, 16 pixels wide: measures how much the
 * per-pixel difference between s1 and s2 changes from one row to the next.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            total += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
#define SQ(a) ((a)*(a))

/* Vertical intra SSE: sum of squared differences between vertically
 * adjacent pixels over a (size x h) area of a single block. The dummy
 * argument only matches the comparison-function ABI; h-1 row pairs are
 * accumulated since the loop starts at y=1. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                               \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/* Vertical SSE of the difference between two 16-wide blocks: sum of squared
 * row-to-row changes of the residual (s1 - s2). */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Sum of squared differences between an int8 vector and an int16 vector.
 * Used to compare quantized basis coefficients against a reference. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2479
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2480
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2481
#if CONFIG_GPL
2482
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2483
#endif
2484
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2485
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2486
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2487
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2488

    
2489
/* Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/* Element-wise product with src1 traversed backwards:
 * dst[i] = src0[i] * src1[len-1-i]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/* Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/* Overlap-add windowing as used by MDCT-based codecs: combines the tail of
 * the previous frame (src1, read backwards) with the head of the current
 * frame (src0) through the symmetric window win.
 * dst and win must point to the *center* minus len of 2*len-entry buffers;
 * the function indexes them from -len to len-1. */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/* Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/* Multiply src by a scalar and by short (length-2) vectors taken from the
 * sv table: one sv entry covers two consecutive output samples. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/* Multiply src by a scalar and by short (length-4) vectors taken from the
 * sv table: one sv entry covers four consecutive output samples. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/* Expand the sv table of length-2 vectors into dst, scaled by mul
 * (like vector_fmul_sv_scalar_2_c but without a src operand). */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/* Expand the sv table of length-4 vectors into dst, scaled by mul
 * (like vector_fmul_sv_scalar_4_c but without a src operand). */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/* In-place butterfly: v1[i] <- v1[i]+v2[i], v2[i] <- v1[i]-v2[i]
 * (original v1). The restrict qualifiers assert the buffers don't alias. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/* Dot product of two float vectors of length len. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/* Clip one float, operating on its IEEE-754 bit pattern, for the case where
 * min < 0 < max (see vector_clipf_c_opposite_sign): negative floats compare
 * greater than `mini` as unsigned ints when their magnitude exceeds |min|;
 * flipping the sign bit lets positives be compared against `maxisign`.
 * Uses 1U<<31 (unsigned) — the previous 1<<31 left-shifted into the sign
 * bit of a signed int, which is undefined behavior in C. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2609
    int i;
2610
    uint32_t mini = *(uint32_t*)min;
2611
    uint32_t maxi = *(uint32_t*)max;
2612
    uint32_t maxisign = maxi ^ (1<<31);
2613
    uint32_t *dsti = (uint32_t*)dst;
2614
    const uint32_t *srci = (const uint32_t*)src;
2615
    for(i=0; i<len; i+=8) {
2616
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2617
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2618
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2619
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2620
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2621
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2622
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2623
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2624
    }
2625
}
2626
/* Clip each element of src into [min, max]. len must be a multiple of 8.
 * Dispatches to the bit-pattern fast path when min and max straddle zero,
 * otherwise uses plain float clamping unrolled by 8. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/* Dot product of two int16 vectors; each partial product is right-shifted
 * by `shift` before accumulation. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/* Returns the dot product v1.v2 while simultaneously updating v1 in place:
 * v1[i] += mul * v3[i]. The product uses the *original* v1 values. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/* Apply a symmetric int16 window (Q15 fixed point) to an input buffer:
 * window[i] multiplies both input[i] and its mirror input[len-1-i], with
 * rounding (+1<<14) before the >>15 rescale. len must be even. */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
/* Fixed-point cosine constants for the WMV2 8x8 IDCT (scaled by 2048). */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* 1-D IDCT over one row of 8 coefficients, in place. */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/* 1-D IDCT over one column (stride 8) of 8 coefficients, in place. */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/* Full 8x8 WMV2 IDCT: rows first, then columns, in place. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
2750
 converted */
2751
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2752
{
2753
    ff_wmv2_idct_c(block);
2754
    ff_put_pixels_clamped_c(block, dest, line_size);
2755
}
2756
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2757
{
2758
    ff_wmv2_idct_c(block);
2759
    ff_add_pixels_clamped_c(block, dest, line_size);
2760
}
2761
/* Reference (jpeg) 8x8 IDCT, store clamped result. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* Reference (jpeg) 8x8 IDCT, add clamped result. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT (lowres=1), store clamped result. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* 4x4 reduced-resolution IDCT (lowres=1), add clamped result. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT (lowres=2), store clamped result. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* 2x2 reduced-resolution IDCT (lowres=2), add clamped result. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
/* 1x1 IDCT (lowres=3): only the DC coefficient survives; store it clamped. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
/* 1x1 IDCT (lowres=3): add the DC coefficient to the destination, clamped. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
/* No-op used as the default prefetch hook. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
2810
av_cold void dsputil_static_init(void)
2811
{
2812
    int i;
2813

    
2814
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2815
    for(i=0;i<MAX_NEG_CROP;i++) {
2816
        ff_cropTbl[i] = 0;
2817
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2818
    }
2819

    
2820
    for(i=0;i<512;i++) {
2821
        ff_squareTbl[i] = (i - 256) * (i - 256);
2822
    }
2823

    
2824
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2825
}
2826

    
2827
int ff_check_alignment(void){
2828
    static int did_fail=0;
2829
    DECLARE_ALIGNED(16, int, aligned);
2830

    
2831
    if((intptr_t)&aligned & 15){
2832
        if(!did_fail){
2833
#if HAVE_MMX || HAVE_ALTIVEC
2834
            av_log(NULL, AV_LOG_ERROR,
2835
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2836
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
2837
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2838
                "Do not report crashes to FFmpeg developers.\n");
2839
#endif
2840
            did_fail=1;
2841
        }
2842
        return -1;
2843
    }
2844
    return 0;
2845
}
2846

    
2847
/* Populate a DSPContext with the C reference implementations, select the
 * (I)DCT according to avctx settings, then let each architecture override
 * entries with optimized versions. */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* IDCT selection: reduced-resolution variants for lowres decoding,
     * otherwise by the requested idct_algo. */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* Architecture-specific overrides of the C defaults. */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* Fill in any 2tap qpel entries the arch code left empty. */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    /* Build the coefficient permutation table matching the selected IDCT. */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}